aboutsummaryrefslogtreecommitdiff
path: root/test
diff options
context:
space:
mode:
authorDavid L. Jones <dlj@google.com>2017-11-15 01:40:05 +0000
committerDavid L. Jones <dlj@google.com>2017-11-15 01:40:05 +0000
commitd5c2cca72463233df77a065f201db31b140eb44d (patch)
tree3f9a978131033302a58b7db7db1ecf2a4622bad2 /test
parentce7676b8db6bac096dad4c4ad62e9e6bb8aa1064 (diff)
parentdcf64df89bc6d775e266ebd6b0134d135f47a35b (diff)
Creating branches/google/testing and tags/google/testing/2017-11-14 from r317716testing
git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/google/testing@318248 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test')
-rw-r--r--test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll208
-rw-r--r--test/Analysis/CostModel/X86/interleaved-load-float.ll141
-rw-r--r--test/Assembler/fast-math-flags.ll32
-rw-r--r--test/Bitcode/compatibility-3.6.ll4
-rw-r--r--test/Bitcode/compatibility-3.7.ll4
-rw-r--r--test/Bitcode/compatibility-3.8.ll8
-rw-r--r--test/Bitcode/compatibility-3.9.ll8
-rw-r--r--test/Bitcode/compatibility-4.0.ll8
-rw-r--r--test/Bitcode/compatibility-5.0.ll8
-rw-r--r--test/Bitcode/compatibility.ll4
-rw-r--r--test/Bitcode/thinlto-summary-local-5.0.ll22
-rw-r--r--test/Bitcode/thinlto-summary-local-5.0.ll.bcbin0 -> 1028 bytes
-rw-r--r--test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll67
-rw-r--r--test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir104
-rw-r--r--test/CodeGen/AArch64/GlobalISel/legalize-add.mir91
-rw-r--r--test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir19
-rw-r--r--test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir4
-rw-r--r--test/CodeGen/AArch64/GlobalISel/select-int-ext.mir6
-rw-r--r--test/CodeGen/AArch64/dwarf-cfi.ll36
-rw-r--r--test/CodeGen/AArch64/recp-fastmath.ll34
-rw-r--r--test/CodeGen/AArch64/sqrt-fastmath.ll83
-rw-r--r--test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir12
-rw-r--r--test/CodeGen/AMDGPU/detect-dead-lanes.mir44
-rw-r--r--test/CodeGen/AMDGPU/mad_64_32.ll168
-rw-r--r--test/CodeGen/AMDGPU/mul.ll125
-rw-r--r--test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir32
-rw-r--r--test/CodeGen/AMDGPU/private-memory-r600.ll249
-rw-r--r--test/CodeGen/AMDGPU/simplify-libcalls.ll122
-rw-r--r--test/CodeGen/AMDGPU/unknown-processor.ll9
-rw-r--r--test/CodeGen/AMDGPU/unsupported-calls.ll4
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir35
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir16
-rw-r--r--test/CodeGen/Generic/llc-start-stop.ll6
-rw-r--r--test/CodeGen/Hexagon/isel-prefer.ll10
-rw-r--r--test/CodeGen/MIR/X86/subregister-index-operands.mir6
-rw-r--r--test/CodeGen/Mips/brind-tailcall.ll60
-rw-r--r--test/CodeGen/Mips/dins.ll14
-rw-r--r--test/CodeGen/Mips/msa/emergency-spill.mir221
-rw-r--r--test/CodeGen/Mips/msa/frameindex.ll49
-rw-r--r--test/CodeGen/Mips/tailcall/tailcall.ll15
-rw-r--r--test/CodeGen/NVPTX/atomics-sm60.ll19
-rw-r--r--test/CodeGen/NVPTX/generic-to-nvvm-ir.ll2
-rw-r--r--test/CodeGen/PowerPC/bswap64.ll13
-rw-r--r--test/CodeGen/PowerPC/p9-vinsert-vextract.ll822
-rw-r--r--test/CodeGen/PowerPC/subreg-postra-2.ll8
-rw-r--r--test/CodeGen/RISCV/alu32.ll1
-rw-r--r--test/CodeGen/RISCV/branch.ll121
-rw-r--r--test/CodeGen/RISCV/calls.ll83
-rw-r--r--test/CodeGen/RISCV/imm.ll47
-rw-r--r--test/CodeGen/RISCV/mem.ll202
-rw-r--r--test/CodeGen/RISCV/wide-mem.ll34
-rw-r--r--test/CodeGen/WebAssembly/inline-asm-m.ll13
-rw-r--r--test/CodeGen/WebAssembly/inline-asm.ll56
-rw-r--r--test/CodeGen/WebAssembly/signext-arg.ll22
-rw-r--r--test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll1
-rw-r--r--test/CodeGen/X86/2011-10-19-widen_vselect.ll1
-rw-r--r--test/CodeGen/X86/GlobalISel/add-scalar.ll1
-rw-r--r--test/CodeGen/X86/GlobalISel/brcond.ll1
-rw-r--r--test/CodeGen/X86/GlobalISel/callingconv.ll21
-rw-r--r--test/CodeGen/X86/GlobalISel/frameIndex.ll1
-rw-r--r--test/CodeGen/X86/GlobalISel/select-cmp.mir26
-rw-r--r--test/CodeGen/X86/GlobalISel/select-copy.mir6
-rw-r--r--test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir10
-rw-r--r--test/CodeGen/X86/GlobalISel/select-ext.mir12
-rw-r--r--test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir2
-rw-r--r--test/CodeGen/X86/O0-pipeline.ll1
-rw-r--r--test/CodeGen/X86/TruncAssertZext.ll1
-rw-r--r--test/CodeGen/X86/avg.ll185
-rw-r--r--test/CodeGen/X86/avx-basic.ll7
-rw-r--r--test/CodeGen/X86/avx-intrinsics-x86.ll52
-rw-r--r--test/CodeGen/X86/avx-schedule.ll8
-rw-r--r--test/CodeGen/X86/avx512-mask-op.ll10
-rw-r--r--test/CodeGen/X86/avx512-regcall-Mask.ll22
-rw-r--r--test/CodeGen/X86/avx512-regcall-NoMask.ll17
-rwxr-xr-xtest/CodeGen/X86/avx512-schedule.ll4
-rw-r--r--test/CodeGen/X86/avx512-select.ll1
-rwxr-xr-xtest/CodeGen/X86/avx512-shuffle-schedule.ll736
-rw-r--r--test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll368
-rw-r--r--test/CodeGen/X86/avx512-skx-insert-subvec.ll2
-rw-r--r--test/CodeGen/X86/avx512-vbroadcast.ll2
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll2
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll12
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics.ll4
-rw-r--r--test/CodeGen/X86/avx512bw-vec-test-testn.ll32
-rw-r--r--test/CodeGen/X86/avx512bwvl-vec-test-testn.ll64
-rw-r--r--test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll37
-rw-r--r--test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll23
-rw-r--r--test/CodeGen/X86/avx512cd-intrinsics.ll22
-rw-r--r--test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll44
-rw-r--r--test/CodeGen/X86/avx512cdvl-intrinsics.ll43
-rw-r--r--test/CodeGen/X86/avx512f-vec-test-testn.ll32
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll30
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll4
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics.ll12
-rw-r--r--test/CodeGen/X86/avx512vl-vbroadcast.ll3
-rw-r--r--test/CodeGen/X86/avx512vl-vec-masked-cmp.ll520
-rw-r--r--test/CodeGen/X86/avx512vl-vec-test-testn.ll128
-rw-r--r--test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll75
-rw-r--r--test/CodeGen/X86/bitcast-and-setcc-256.ll1
-rw-r--r--test/CodeGen/X86/bitcast-and-setcc-512.ll4
-rw-r--r--test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll6
-rw-r--r--test/CodeGen/X86/bitcast-setcc-256.ll1
-rw-r--r--test/CodeGen/X86/bitcast-setcc-512.ll4
-rw-r--r--test/CodeGen/X86/bool-vector.ll3
-rw-r--r--test/CodeGen/X86/broadcastm-lowering.ll7
-rw-r--r--test/CodeGen/X86/cmp.ll3
-rw-r--r--test/CodeGen/X86/combine-srl.ll2
-rw-r--r--test/CodeGen/X86/compress_expand.ll4
-rw-r--r--test/CodeGen/X86/emutls-pie.ll6
-rw-r--r--test/CodeGen/X86/emutls.ll16
-rw-r--r--test/CodeGen/X86/epilogue-cfi-fp.ll43
-rw-r--r--test/CodeGen/X86/epilogue-cfi-no-fp.ll46
-rw-r--r--test/CodeGen/X86/f16c-intrinsics.ll271
-rw-r--r--test/CodeGen/X86/fast-isel-int-float-conversion.ll12
-rw-r--r--test/CodeGen/X86/fast-isel-store.ll10
-rw-r--r--test/CodeGen/X86/fma-intrinsics-x86.ll874
-rw-r--r--test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll18
-rw-r--r--test/CodeGen/X86/frame-lowering-debug-intrinsic.ll4
-rw-r--r--test/CodeGen/X86/haddsub-2.ll12
-rw-r--r--test/CodeGen/X86/hipe-cc64.ll1
-rw-r--r--test/CodeGen/X86/horizontal-reduce-smax.ll1896
-rw-r--r--test/CodeGen/X86/horizontal-reduce-smin.ll1898
-rw-r--r--test/CodeGen/X86/horizontal-reduce-umax.ll2203
-rw-r--r--test/CodeGen/X86/horizontal-reduce-umin.ll2207
-rw-r--r--test/CodeGen/X86/illegal-bitfield-loadstore.ll1
-rw-r--r--test/CodeGen/X86/imul.ll3
-rw-r--r--test/CodeGen/X86/inline-asm-A-constraint.ll3
-rw-r--r--test/CodeGen/X86/lea-opt-cse1.ll1
-rw-r--r--test/CodeGen/X86/lea-opt-cse2.ll2
-rw-r--r--test/CodeGen/X86/lea-opt-cse3.ll2
-rw-r--r--test/CodeGen/X86/lea-opt-cse4.ll3
-rw-r--r--test/CodeGen/X86/legalize-shift-64.ll5
-rw-r--r--test/CodeGen/X86/live-out-reg-info.ll1
-rw-r--r--test/CodeGen/X86/load-combine.ll2
-rw-r--r--test/CodeGen/X86/masked_gather_scatter.ll34
-rw-r--r--test/CodeGen/X86/masked_memop.ll12
-rw-r--r--test/CodeGen/X86/memcmp-optsize.ll224
-rw-r--r--test/CodeGen/X86/memcmp.ll240
-rw-r--r--test/CodeGen/X86/memset-nonzero.ll1
-rw-r--r--test/CodeGen/X86/merge-consecutive-loads-128.ll19
-rw-r--r--test/CodeGen/X86/movtopush.ll4
-rw-r--r--test/CodeGen/X86/mul-constant-result.ll59
-rw-r--r--test/CodeGen/X86/mul-i256.ll8
-rw-r--r--test/CodeGen/X86/mul128.ll5
-rw-r--r--test/CodeGen/X86/no-plt.ll30
-rw-r--r--test/CodeGen/X86/pop-stack-cleanup-msvc.ll26
-rw-r--r--test/CodeGen/X86/pr21792.ll1
-rw-r--r--test/CodeGen/X86/pr29061.ll2
-rw-r--r--test/CodeGen/X86/pr29112.ll1
-rw-r--r--test/CodeGen/X86/pr30430.ll1
-rw-r--r--test/CodeGen/X86/pr32241.ll2
-rw-r--r--test/CodeGen/X86/pr32256.ll1
-rw-r--r--test/CodeGen/X86/pr32282.ll1
-rw-r--r--test/CodeGen/X86/pr32284.ll16
-rw-r--r--test/CodeGen/X86/pr32329.ll4
-rw-r--r--test/CodeGen/X86/pr32345.ll2
-rw-r--r--test/CodeGen/X86/pr32451.ll2
-rw-r--r--test/CodeGen/X86/pr34088.ll1
-rw-r--r--test/CodeGen/X86/pr34653.ll210
-rw-r--r--test/CodeGen/X86/pr34657.ll20
-rw-r--r--test/CodeGen/X86/pr9743.ll1
-rw-r--r--test/CodeGen/X86/push-cfi-debug.ll4
-rw-r--r--test/CodeGen/X86/push-cfi-obj.ll7
-rw-r--r--test/CodeGen/X86/push-cfi.ll3
-rw-r--r--test/CodeGen/X86/recip-fastmath.ll16
-rw-r--r--test/CodeGen/X86/recip-fastmath2.ll32
-rw-r--r--test/CodeGen/X86/return-ext.ll3
-rw-r--r--test/CodeGen/X86/rtm.ll1
-rw-r--r--test/CodeGen/X86/schedule-x86_32.ll348
-rw-r--r--test/CodeGen/X86/schedule-x86_64.ll737
-rw-r--r--test/CodeGen/X86/select-mmx.ll2
-rw-r--r--test/CodeGen/X86/select.ll38
-rw-r--r--test/CodeGen/X86/setcc-lowering.ll8
-rw-r--r--test/CodeGen/X86/shrink_vmul.ll13
-rw-r--r--test/CodeGen/X86/sse-intrinsics-x86.ll52
-rw-r--r--test/CodeGen/X86/sse-schedule.ll8
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-x86.ll28
-rw-r--r--test/CodeGen/X86/statepoint-call-lowering.ll1
-rw-r--r--test/CodeGen/X86/statepoint-gctransition-call-lowering.ll1
-rw-r--r--test/CodeGen/X86/statepoint-invoke.ll3
-rw-r--r--test/CodeGen/X86/throws-cfi-fp.ll98
-rw-r--r--test/CodeGen/X86/throws-cfi-no-fp.ll97
-rw-r--r--test/CodeGen/X86/var-permute-128.ll199
-rw-r--r--test/CodeGen/X86/var-permute-256.ll1020
-rw-r--r--test/CodeGen/X86/var-permute-512.ll618
-rw-r--r--test/CodeGen/X86/vec_fp_to_int.ll74
-rw-r--r--test/CodeGen/X86/vector-half-conversions.ll2620
-rw-r--r--test/CodeGen/X86/vector-sext.ll13
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v16.ll18
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v4.ll8
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v8.ll45
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v8.ll21
-rw-r--r--test/CodeGen/X86/vector-shuffle-avx512.ll2
-rw-r--r--test/CodeGen/X86/vector-shuffle-v1.ll2
-rw-r--r--test/CodeGen/X86/vector-trunc.ll73
-rw-r--r--test/CodeGen/X86/wide-integer-cmp.ll3
-rw-r--r--test/CodeGen/X86/x86-framelowering-trap.ll1
-rw-r--r--test/CodeGen/X86/x86-interleaved-access.ll1
-rw-r--r--test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll1
-rw-r--r--test/DebugInfo/AArch64/inlined-argument.ll140
-rw-r--r--test/DebugInfo/ARM/illegal-fragment.ll95
-rw-r--r--test/DebugInfo/ARM/salvage-debug-info.ll118
-rw-r--r--test/DebugInfo/Generic/location-verifier.ll2
-rw-r--r--test/DebugInfo/Generic/missing-abstract-variable.ll5
-rw-r--r--test/DebugInfo/Inputs/dwarfdump-header.elf-x86-64bin3056 -> 0 bytes
-rw-r--r--test/DebugInfo/X86/dwarfdump-header-64.s149
-rw-r--r--test/DebugInfo/X86/dwarfdump-header.s (renamed from test/DebugInfo/Inputs/dwarfdump-header.s)56
-rw-r--r--test/DebugInfo/X86/live-debug-variables.ll5
-rw-r--r--test/DebugInfo/dwarfdump-header.test60
-rw-r--r--test/FileCheck/defines.txt9
-rw-r--r--test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll6
-rw-r--r--test/LTO/Resolution/X86/comdat-mixed-lto.ll2
-rw-r--r--test/LTO/Resolution/X86/comdat.ll4
-rw-r--r--test/LTO/Resolution/X86/commons.ll2
-rw-r--r--test/MC/AArch64/SVE/assembler_tests/add.s66
-rw-r--r--test/MC/AArch64/SVE/assembler_tests/sub.s66
-rw-r--r--test/MC/AArch64/SVE/disassembler_tests/add.s50
-rw-r--r--test/MC/AArch64/SVE/disassembler_tests/sub.s50
-rw-r--r--test/MC/Disassembler/Mips/micromips32r3/valid-el.txt1
-rw-r--r--test/MC/Disassembler/Mips/micromips32r3/valid.txt1
-rw-r--r--test/MC/Disassembler/Mips/micromips32r6/valid.txt2
-rw-r--r--test/MC/Disassembler/Mips/micromips64r6/valid.txt2
-rw-r--r--test/MC/Disassembler/X86/prefixes-i386.txt78
-rw-r--r--test/MC/Disassembler/X86/prefixes-x86_64.txt24
-rw-r--r--test/MC/Disassembler/X86/prefixes.txt66
-rw-r--r--test/MC/Disassembler/X86/simple-tests.txt9
-rw-r--r--test/MC/Mips/micromips32r6/valid.s2
-rw-r--r--test/MC/Mips/micromips64r6/valid.s2
-rw-r--r--test/MC/Mips/tls-symbols.s28
-rw-r--r--test/Object/Inputs/trivial-object-test.coff-arm64bin0 -> 318 bytes
-rw-r--r--test/Object/Inputs/trivial-object-test.coff-armntbin0 -> 314 bytes
-rw-r--r--test/Object/archive-SYM64-write.test38
-rw-r--r--test/Object/obj2yaml.test158
-rw-r--r--test/Other/new-pm-defaults.ll1
-rw-r--r--test/Other/new-pm-lto-defaults.ll9
-rw-r--r--test/Other/new-pm-thinlto-defaults.ll1
-rw-r--r--test/ThinLTO/X86/deadstrip.ll30
-rw-r--r--test/ThinLTO/X86/funcimport2.ll4
-rw-r--r--test/ThinLTO/X86/internalize.ll9
-rw-r--r--test/ThinLTO/X86/lazyload_metadata.ll4
-rw-r--r--test/ThinLTO/X86/reference_non_importable.ll2
-rw-r--r--test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll339
-rw-r--r--test/Transforms/CallSiteSplitting/callsite-split.ll119
-rw-r--r--test/Transforms/CodeExtractor/PartialInlineNoInline.ll45
-rw-r--r--test/Transforms/CodeGenPrepare/ARM/sink-addrmode.ll18
-rw-r--r--test/Transforms/CodeGenPrepare/X86/sink-addrmode-base.ll475
-rw-r--r--test/Transforms/ExpandMemCmp/X86/lit.local.cfg (renamed from test/LibDriver/lit.local.cfg)0
-rw-r--r--test/Transforms/ExpandMemCmp/X86/memcmp.ll (renamed from test/Transforms/CodeGenPrepare/X86/memcmp.ll)519
-rw-r--r--test/Transforms/IRCE/add-metadata-pre-post-loops.ll2
-rw-r--r--test/Transforms/IndVarSimplify/scev-phi-debug-info.ll71
-rw-r--r--test/Transforms/InstCombine/debuginfo_add.ll108
-rw-r--r--test/Transforms/InstCombine/shift.ll260
-rw-r--r--test/Transforms/LICM/sinking.ll284
-rw-r--r--test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll46
-rw-r--r--test/Transforms/LoopPredication/widened.ll138
-rw-r--r--test/Transforms/LoopVectorize/pr34681.ll122
-rw-r--r--test/Transforms/LoopVectorize/version-mem-access.ll5
-rw-r--r--test/Transforms/LowerTypeTests/blockaddress.ll27
-rw-r--r--test/Transforms/LowerTypeTests/import-unsat.ll1
-rw-r--r--test/Transforms/PGOProfile/Inputs/irreducible.proftext29
-rw-r--r--test/Transforms/PGOProfile/irreducible.ll184
-rw-r--r--test/Transforms/PGOProfile/thinlto_samplepgo_icp2.ll2
-rw-r--r--test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll48
-rw-r--r--test/Transforms/SLPVectorizer/X86/call.ll245
-rw-r--r--test/Transforms/SLPVectorizer/X86/cast.ll51
-rw-r--r--test/Transforms/SLPVectorizer/X86/load-merge.ll50
-rw-r--r--test/Transforms/SLPVectorizer/X86/stores_vectorize.ll84
-rw-r--r--test/Transforms/SampleProfile/indirect-call.ll2
-rw-r--r--test/Transforms/SimplifyCFG/merge-cond-stores-2.ll2
-rw-r--r--test/Transforms/WholeProgramDevirt/import-indir.ll1
-rw-r--r--test/lit.cfg.py4
-rw-r--r--test/lit.site.cfg.py.in3
-rw-r--r--test/tools/dsymutil/cmdline.test2
-rw-r--r--test/tools/gold/X86/asm_undefined2.ll3
-rw-r--r--test/tools/gold/X86/coff.ll2
-rw-r--r--test/tools/gold/X86/common.ll2
-rw-r--r--test/tools/gold/X86/emit-llvm.ll6
-rw-r--r--test/tools/gold/X86/global_with_section.ll16
-rw-r--r--test/tools/gold/X86/parallel.ll8
-rw-r--r--test/tools/gold/X86/thinlto_linkonceresolution.ll2
-rw-r--r--test/tools/gold/X86/thinlto_weak_library.ll2
-rw-r--r--test/tools/gold/X86/visibility.ll2
-rw-r--r--test/tools/llvm-ar/default-add.test3
-rw-r--r--test/tools/llvm-cfi-verify/X86/Inputs/protected-lineinfo.s195
-rw-r--r--test/tools/llvm-cfi-verify/X86/Inputs/unprotected-fullinfo.s380
-rw-r--r--test/tools/llvm-cfi-verify/X86/Inputs/unprotected-lineinfo.s159
-rw-r--r--test/tools/llvm-cfi-verify/X86/Inputs/unprotected-nolineinfo.s87
-rw-r--r--test/tools/llvm-cfi-verify/X86/blacklist-expected-unprotected.s17
-rw-r--r--test/tools/llvm-cfi-verify/X86/blacklist-match-fun.s17
-rw-r--r--test/tools/llvm-cfi-verify/X86/blacklist-unexpected-protected.s17
-rw-r--r--test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s5
-rw-r--r--test/tools/llvm-cfi-verify/X86/protected-lineinfo.s204
-rw-r--r--test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s168
-rw-r--r--test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s91
-rw-r--r--test/tools/llvm-lib/Inputs/a.s (renamed from test/LibDriver/Inputs/a.s)0
-rw-r--r--test/tools/llvm-lib/Inputs/b.s (renamed from test/LibDriver/Inputs/b.s)0
-rwxr-xr-xtest/tools/llvm-lib/Inputs/cl-gl.obj (renamed from test/LibDriver/Inputs/cl-gl.obj)bin3734 -> 3734 bytes
-rw-r--r--test/tools/llvm-lib/Inputs/resource.res (renamed from test/LibDriver/Inputs/resource.res)bin108 -> 108 bytes
-rw-r--r--test/tools/llvm-lib/infer-output-path.test (renamed from test/LibDriver/infer-output-path.test)0
-rw-r--r--test/tools/llvm-lib/invalid.test (renamed from test/LibDriver/invalid.test)0
-rw-r--r--test/tools/llvm-lib/libpath.test (renamed from test/LibDriver/libpath.test)0
-rw-r--r--test/tools/llvm-lib/lit.local.cfg3
-rw-r--r--test/tools/llvm-lib/no-inputs.test (renamed from test/LibDriver/no-inputs.test)0
-rw-r--r--test/tools/llvm-lib/resource.test (renamed from test/LibDriver/resource.test)0
-rw-r--r--test/tools/llvm-lib/thin.test (renamed from test/LibDriver/thin.test)0
-rw-r--r--test/tools/llvm-lib/use-paths.test (renamed from test/LibDriver/use-paths.test)0
-rw-r--r--test/tools/llvm-nm/X86/externalonly.test1
-rw-r--r--test/tools/llvm-nm/X86/importlibrary.test2
-rw-r--r--test/tools/llvm-objcopy/Inputs/dwarf.dwobin0 -> 3568 bytes
-rw-r--r--test/tools/llvm-objcopy/check-addr-offset-align-binary.test40
-rw-r--r--test/tools/llvm-objcopy/check-addr-offset-align.test67
-rw-r--r--test/tools/llvm-objcopy/drawf-fission.test43
-rw-r--r--test/tools/llvm-objdump/X86/Inputs/macho-invalid-reloc-section-indexbin0 -> 2768 bytes
-rw-r--r--test/tools/llvm-objdump/X86/malformed-machos.test3
314 files changed, 22575 insertions, 6449 deletions
diff --git a/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll b/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll
new file mode 100644
index 00000000000..0a580276d95
--- /dev/null
+++ b/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll
@@ -0,0 +1,208 @@
+; RUN: opt < %s -analyze -block-freq | FileCheck %s
+; RUN: opt < %s -passes='print<block-freq>' -disable-output 2>&1 | FileCheck %s
+
+; Function Attrs: noinline norecurse nounwind readnone uwtable
+define i32 @_Z11irreducibleii(i32 %iter_outer, i32 %iter_inner) local_unnamed_addr !prof !27 {
+entry:
+ %cmp24 = icmp sgt i32 %iter_outer, 0
+ br i1 %cmp24, label %for.body, label %entry.for.cond.cleanup_crit_edge, !prof !28
+
+entry.for.cond.cleanup_crit_edge: ; preds = %entry
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.end, %entry.for.cond.cleanup_crit_edge
+ %sum.0.lcssa = phi i32 [ 0, %entry.for.cond.cleanup_crit_edge ], [ %sum.1, %for.end ]
+ ret i32 %sum.0.lcssa
+
+for.body: ; preds = %for.end, %entry
+ %k.026 = phi i32 [ %inc12, %for.end ], [ 0, %entry ]
+ %sum.025 = phi i32 [ %sum.1, %for.end ], [ 0, %entry ]
+ %rem23 = and i32 %k.026, 1
+ %cmp1 = icmp eq i32 %rem23, 0
+ br i1 %cmp1, label %entry8, label %for.cond2, !prof !29
+
+for.cond2: ; preds = %if.end9, %for.body
+ %sum.1 = phi i32 [ %add10, %if.end9 ], [ %sum.025, %for.body ]
+ %i.0 = phi i32 [ %inc, %if.end9 ], [ 0, %for.body ]
+ %cmp3 = icmp slt i32 %i.0, %iter_inner
+ br i1 %cmp3, label %for.body4, label %for.end, !prof !30, !irr_loop !31
+
+for.body4: ; preds = %for.cond2
+ %rem5 = srem i32 %k.026, 3
+ %cmp6 = icmp eq i32 %rem5, 0
+ br i1 %cmp6, label %entry8, label %if.end9, !prof !32
+
+entry8: ; preds = %for.body4, %for.body
+ %sum.2 = phi i32 [ %sum.025, %for.body ], [ %sum.1, %for.body4 ]
+ %i.1 = phi i32 [ 0, %for.body ], [ %i.0, %for.body4 ]
+ %add = add nsw i32 %sum.2, 4
+ br label %if.end9, !irr_loop !33
+
+if.end9: ; preds = %entry8, %for.body4
+ %sum.3 = phi i32 [ %add, %entry8 ], [ %sum.1, %for.body4 ]
+ %i.2 = phi i32 [ %i.1, %entry8 ], [ %i.0, %for.body4 ]
+ %add10 = add nsw i32 %sum.3, 1
+ %inc = add nsw i32 %i.2, 1
+ br label %for.cond2, !irr_loop !34
+
+for.end: ; preds = %for.cond2
+ %inc12 = add nuw nsw i32 %k.026, 1
+ %exitcond = icmp eq i32 %inc12, %iter_outer
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !35
+}
+
+!27 = !{!"function_entry_count", i64 1}
+!28 = !{!"branch_weights", i32 1, i32 0}
+!29 = !{!"branch_weights", i32 50, i32 50}
+!30 = !{!"branch_weights", i32 950, i32 100}
+!31 = !{!"loop_header_weight", i64 1050}
+!32 = !{!"branch_weights", i32 323, i32 627}
+!33 = !{!"loop_header_weight", i64 373}
+!34 = !{!"loop_header_weight", i64 1000}
+!35 = !{!"branch_weights", i32 1, i32 99}
+
+; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreducibleii':
+; CHECK-NEXT: block-frequency-info: _Z11irreducibleii
+; CHECK-NEXT: - entry: {{.*}} count = 1
+; CHECK-NEXT: - entry.for.cond.cleanup_crit_edge: {{.*}} count = 0
+; CHECK-NEXT: - for.cond.cleanup: {{.*}} count = 1
+; CHECK-NEXT: - for.body: {{.*}} count = 100
+; CHECK-NEXT: - for.cond2: {{.*}} count = 1050, irr_loop_header_weight = 1050
+; CHECK-NEXT: - for.body4: {{.*}} count = 950
+; CHECK-NEXT: - entry8: {{.*}} count = 373, irr_loop_header_weight = 373
+; CHECK-NEXT: - if.end9: {{.*}} count = 1000, irr_loop_header_weight = 1000
+; CHECK-NEXT: - for.end: {{.*}} count = 100
+
+@targets = local_unnamed_addr global [256 x i8*] zeroinitializer, align 16
+@tracing = local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: noinline norecurse nounwind uwtable
+define i32 @_Z11irreduciblePh(i8* nocapture readonly %p) !prof !27 {
+entry:
+ store <2 x i8*> <i8* blockaddress(@_Z11irreduciblePh, %sw.bb), i8* blockaddress(@_Z11irreduciblePh, %TARGET_1)>, <2 x i8*>* bitcast ([256 x i8*]* @targets to <2 x i8*>*), align 16
+ store i8* blockaddress(@_Z11irreduciblePh, %TARGET_2), i8** getelementptr inbounds ([256 x i8*], [256 x i8*]* @targets, i64 0, i64 2), align 16
+ %0 = load i32, i32* @tracing, align 4
+ %tobool = icmp eq i32 %0, 0
+ br label %for.cond1
+
+for.cond1: ; preds = %sw.default, %entry
+ %p.addr.0 = phi i8* [ %p, %entry ], [ %p.addr.4, %sw.default ]
+ %sum.0 = phi i32 [ 0, %entry ], [ %add25, %sw.default ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i64 1
+ %1 = load i8, i8* %p.addr.0, align 1
+ %incdec.ptr2 = getelementptr inbounds i8, i8* %p.addr.0, i64 2
+ %2 = load i8, i8* %incdec.ptr, align 1
+ %conv3 = zext i8 %2 to i32
+ br label %dispatch_op
+
+dispatch_op: ; preds = %sw.bb6, %for.cond1
+ %p.addr.1 = phi i8* [ %incdec.ptr2, %for.cond1 ], [ %p.addr.2, %sw.bb6 ]
+ %op.0 = phi i8 [ %1, %for.cond1 ], [ 1, %sw.bb6 ]
+ %oparg.0 = phi i32 [ %conv3, %for.cond1 ], [ %oparg.2, %sw.bb6 ]
+ %sum.1 = phi i32 [ %sum.0, %for.cond1 ], [ %add7, %sw.bb6 ]
+ switch i8 %op.0, label %sw.default [
+ i8 0, label %sw.bb
+ i8 1, label %dispatch_op.sw.bb6_crit_edge
+ i8 2, label %sw.bb15
+ ], !prof !36
+
+dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op
+ br label %sw.bb6
+
+sw.bb: ; preds = %indirectgoto, %dispatch_op
+ %oparg.1 = phi i32 [ %oparg.0, %dispatch_op ], [ 0, %indirectgoto ]
+ %sum.2 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %indirectgoto ]
+ %add.neg = sub i32 -5, %oparg.1
+ %sub = add i32 %add.neg, %sum.2
+ br label %exit
+
+TARGET_1: ; preds = %indirectgoto
+ %incdec.ptr4 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
+ %3 = load i8, i8* %p.addr.5, align 1
+ %conv5 = zext i8 %3 to i32
+ br label %sw.bb6
+
+sw.bb6: ; preds = %TARGET_1, %dispatch_op.sw.bb6_crit_edge
+ %p.addr.2 = phi i8* [ %incdec.ptr4, %TARGET_1 ], [ %p.addr.1, %dispatch_op.sw.bb6_crit_edge ]
+ %oparg.2 = phi i32 [ %conv5, %TARGET_1 ], [ %oparg.0, %dispatch_op.sw.bb6_crit_edge ]
+ %sum.3 = phi i32 [ %sum.7, %TARGET_1 ], [ %sum.1, %dispatch_op.sw.bb6_crit_edge ]
+ %mul = mul nsw i32 %oparg.2, 7
+ %add7 = add nsw i32 %sum.3, %mul
+ %rem46 = and i32 %add7, 1
+ %cmp8 = icmp eq i32 %rem46, 0
+ br i1 %cmp8, label %dispatch_op, label %if.then, !prof !37, !irr_loop !38
+
+if.then: ; preds = %sw.bb6
+ %mul9 = mul nsw i32 %add7, 9
+ br label %indirectgoto
+
+TARGET_2: ; preds = %indirectgoto
+ %incdec.ptr13 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
+ %4 = load i8, i8* %p.addr.5, align 1
+ %conv14 = zext i8 %4 to i32
+ br label %sw.bb15
+
+sw.bb15: ; preds = %TARGET_2, %dispatch_op
+ %p.addr.3 = phi i8* [ %p.addr.1, %dispatch_op ], [ %incdec.ptr13, %TARGET_2 ]
+ %oparg.3 = phi i32 [ %oparg.0, %dispatch_op ], [ %conv14, %TARGET_2 ]
+ %sum.4 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %TARGET_2 ]
+ %add16 = add nsw i32 %oparg.3, 3
+ %add17 = add nsw i32 %add16, %sum.4
+ br i1 %tobool, label %if.then18, label %exit, !prof !39, !irr_loop !40
+
+if.then18: ; preds = %sw.bb15
+ %idx.ext = sext i32 %oparg.3 to i64
+ %add.ptr = getelementptr inbounds i8, i8* %p.addr.3, i64 %idx.ext
+ %mul19 = mul nsw i32 %add17, 17
+ br label %indirectgoto
+
+unknown_op: ; preds = %indirectgoto
+ %sub24 = add nsw i32 %sum.7, -4
+ br label %sw.default
+
+sw.default: ; preds = %unknown_op, %dispatch_op
+ %p.addr.4 = phi i8* [ %p.addr.5, %unknown_op ], [ %p.addr.1, %dispatch_op ]
+ %sum.5 = phi i32 [ %sub24, %unknown_op ], [ %sum.1, %dispatch_op ]
+ %add25 = add nsw i32 %sum.5, 11
+ br label %for.cond1
+
+exit: ; preds = %sw.bb15, %sw.bb
+ %sum.6 = phi i32 [ %sub, %sw.bb ], [ %add17, %sw.bb15 ]
+ ret i32 %sum.6
+
+indirectgoto: ; preds = %if.then18, %if.then
+ %add.ptr.pn = phi i8* [ %add.ptr, %if.then18 ], [ %p.addr.2, %if.then ]
+ %sum.7 = phi i32 [ %mul19, %if.then18 ], [ %mul9, %if.then ]
+ %p.addr.5 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 1
+ %5 = load i8, i8* %add.ptr.pn, align 1
+ %idxprom21 = zext i8 %5 to i64
+ %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21
+ %6 = load i8*, i8** %arrayidx22, align 8
+ indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !41, !irr_loop !42
+}
+
+!36 = !{!"branch_weights", i32 0, i32 0, i32 201, i32 1}
+!37 = !{!"branch_weights", i32 201, i32 300}
+!38 = !{!"loop_header_weight", i64 501}
+!39 = !{!"branch_weights", i32 100, i32 0}
+!40 = !{!"loop_header_weight", i64 100}
+!41 = !{!"branch_weights", i32 0, i32 1, i32 300, i32 99}
+!42 = !{!"loop_header_weight", i64 400}
+
+; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreduciblePh':
+; CHECK-NEXT: block-frequency-info: _Z11irreduciblePh
+; CHECK-NEXT: - entry: {{.*}} count = 1
+; CHECK-NEXT: - for.cond1: {{.*}} count = 1
+; CHECK-NEXT: - dispatch_op: {{.*}} count = 201
+; CHECK-NEXT: - dispatch_op.sw.bb6_crit_edge: {{.*}} count = 200
+; CHECK-NEXT: - sw.bb: {{.*}} count = 0
+; CHECK-NEXT: - TARGET_1: {{.*}} count = 299
+; CHECK-NEXT: - sw.bb6: {{.*}} count = 500, irr_loop_header_weight = 501
+; CHECK-NEXT: - if.then: {{.*}} count = 299
+; CHECK-NEXT: - TARGET_2: {{.*}} count = 98
+; CHECK-NEXT: - sw.bb15: {{.*}} count = 99, irr_loop_header_weight = 100
+; CHECK-NEXT: - if.then18: {{.*}} count = 99
+; CHECK-NEXT: - unknown_op: {{.*}} count = 0
+; CHECK-NEXT: - sw.default: {{.*}} count = 0
+; CHECK-NEXT: - exit: {{.*}} count = 1
+; CHECK-NEXT: - indirectgoto: {{.*}} count = 399, irr_loop_header_weight = 400
diff --git a/test/Analysis/CostModel/X86/interleaved-load-float.ll b/test/Analysis/CostModel/X86/interleaved-load-float.ll
new file mode 100644
index 00000000000..373a55d7ad4
--- /dev/null
+++ b/test/Analysis/CostModel/X86/interleaved-load-float.ll
@@ -0,0 +1,141 @@
+; REQUIRES: asserts
+; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake %s 2>&1 | FileCheck %s
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+@src = common local_unnamed_addr global [120 x float] zeroinitializer, align 4
+@dst = common local_unnamed_addr global [120 x float] zeroinitializer, align 4
+
+; Function Attrs: norecurse nounwind
+define void @stride8(float %k, i32 %width_) {
+entry:
+
+; CHECK: Found an estimated cost of 48 for VF 8 For instruction: %0 = load float
+
+ %cmp72 = icmp sgt i32 %width_, 0
+ br i1 %cmp72, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %i.073 = phi i32 [ 0, %for.body.lr.ph ], [ %add46, %for.body ]
+ %arrayidx = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %i.073
+ %0 = load float, float* %arrayidx, align 4
+ %mul = fmul fast float %0, %k
+ %arrayidx2 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %i.073
+ %1 = load float, float* %arrayidx2, align 4
+ %add3 = fadd fast float %1, %mul
+ store float %add3, float* %arrayidx2, align 4
+ %add4 = or i32 %i.073, 1
+ %arrayidx5 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add4
+ %2 = load float, float* %arrayidx5, align 4
+ %mul6 = fmul fast float %2, %k
+ %arrayidx8 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add4
+ %3 = load float, float* %arrayidx8, align 4
+ %add9 = fadd fast float %3, %mul6
+ store float %add9, float* %arrayidx8, align 4
+ %add10 = or i32 %i.073, 2
+ %arrayidx11 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add10
+ %4 = load float, float* %arrayidx11, align 4
+ %mul12 = fmul fast float %4, %k
+ %arrayidx14 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add10
+ %5 = load float, float* %arrayidx14, align 4
+ %add15 = fadd fast float %5, %mul12
+ store float %add15, float* %arrayidx14, align 4
+ %add16 = or i32 %i.073, 3
+ %arrayidx17 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add16
+ %6 = load float, float* %arrayidx17, align 4
+ %mul18 = fmul fast float %6, %k
+ %arrayidx20 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add16
+ %7 = load float, float* %arrayidx20, align 4
+ %add21 = fadd fast float %7, %mul18
+ store float %add21, float* %arrayidx20, align 4
+ %add22 = or i32 %i.073, 4
+ %arrayidx23 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add22
+ %8 = load float, float* %arrayidx23, align 4
+ %mul24 = fmul fast float %8, %k
+ %arrayidx26 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add22
+ %9 = load float, float* %arrayidx26, align 4
+ %add27 = fadd fast float %9, %mul24
+ store float %add27, float* %arrayidx26, align 4
+ %add28 = or i32 %i.073, 5
+ %arrayidx29 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add28
+ %10 = load float, float* %arrayidx29, align 4
+ %mul30 = fmul fast float %10, %k
+ %arrayidx32 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add28
+ %11 = load float, float* %arrayidx32, align 4
+ %add33 = fadd fast float %11, %mul30
+ store float %add33, float* %arrayidx32, align 4
+ %add34 = or i32 %i.073, 6
+ %arrayidx35 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add34
+ %12 = load float, float* %arrayidx35, align 4
+ %mul36 = fmul fast float %12, %k
+ %arrayidx38 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add34
+ %13 = load float, float* %arrayidx38, align 4
+ %add39 = fadd fast float %13, %mul36
+ store float %add39, float* %arrayidx38, align 4
+ %add40 = or i32 %i.073, 7
+ %arrayidx41 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add40
+ %14 = load float, float* %arrayidx41, align 4
+ %mul42 = fmul fast float %14, %k
+ %arrayidx44 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add40
+ %15 = load float, float* %arrayidx44, align 4
+ %add45 = fadd fast float %15, %mul42
+ store float %add45, float* %arrayidx44, align 4
+ %add46 = add nuw nsw i32 %i.073, 8
+ %cmp = icmp slt i32 %add46, %width_
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+; Function Attrs: norecurse nounwind
+define void @stride3(float %k, i32 %width_) {
+entry:
+
+; CHECK: Found an estimated cost of 20 for VF 8 For instruction: %0 = load float
+
+ %cmp27 = icmp sgt i32 %width_, 0
+ br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %add16, %for.body ]
+ %arrayidx = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %i.028
+ %0 = load float, float* %arrayidx, align 4
+ %mul = fmul fast float %0, %k
+ %arrayidx2 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %i.028
+ %1 = load float, float* %arrayidx2, align 4
+ %add3 = fadd fast float %1, %mul
+ store float %add3, float* %arrayidx2, align 4
+ %add4 = add nuw nsw i32 %i.028, 1
+ %arrayidx5 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add4
+ %2 = load float, float* %arrayidx5, align 4
+ %mul6 = fmul fast float %2, %k
+ %arrayidx8 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add4
+ %3 = load float, float* %arrayidx8, align 4
+ %add9 = fadd fast float %3, %mul6
+ store float %add9, float* %arrayidx8, align 4
+ %add10 = add nuw nsw i32 %i.028, 2
+ %arrayidx11 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add10
+ %4 = load float, float* %arrayidx11, align 4
+ %mul12 = fmul fast float %4, %k
+ %arrayidx14 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add10
+ %5 = load float, float* %arrayidx14, align 4
+ %add15 = fadd fast float %5, %mul12
+ store float %add15, float* %arrayidx14, align 4
+ %add16 = add nuw nsw i32 %i.028, 3
+ %cmp = icmp slt i32 %add16, %width_
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
diff --git a/test/Assembler/fast-math-flags.ll b/test/Assembler/fast-math-flags.ll
index 4ef3607e1d0..664b1bd271e 100644
--- a/test/Assembler/fast-math-flags.ll
+++ b/test/Assembler/fast-math-flags.ll
@@ -7,6 +7,8 @@
@vec = external global <3 x float>
@arr = external global [3 x float]
+declare float @foo(float)
+
define float @none(float %x, float %y) {
entry:
; CHECK: %vec = load <3 x float>, <3 x float>* @vec
@@ -86,6 +88,28 @@ entry:
ret float %c
}
+; CHECK: @reassoc(
+define float @reassoc(float %x, float %y) {
+; CHECK: %a = fsub reassoc float %x, %y
+ %a = fsub reassoc float %x, %y
+; CHECK: %b = fmul reassoc float %x, %y
+ %b = fmul reassoc float %x, %y
+; CHECK: %c = call reassoc float @foo(float %b)
+ %c = call reassoc float @foo(float %b)
+ ret float %c
+}
+
+; CHECK: @afn(
+define float @afn(float %x, float %y) {
+; CHECK: %a = fdiv afn float %x, %y
+ %a = fdiv afn float %x, %y
+; CHECK: %b = frem afn float %x, %y
+ %b = frem afn float %x, %y
+; CHECK: %c = call afn float @foo(float %b)
+ %c = call afn float @foo(float %b)
+ ret float %c
+}
+
; CHECK: no_nan_inf
define float @no_nan_inf(float %x, float %y) {
entry:
@@ -130,10 +154,10 @@ entry:
; CHECK: %arr = load [3 x float], [3 x float]* @arr
%arr = load [3 x float], [3 x float]* @arr
-; CHECK: %a = fadd nnan ninf float %x, %y
- %a = fadd ninf nnan float %x, %y
-; CHECK: %a_vec = fadd nnan <3 x float> %vec, %vec
- %a_vec = fadd nnan <3 x float> %vec, %vec
+; CHECK: %a = fadd nnan ninf afn float %x, %y
+ %a = fadd ninf nnan afn float %x, %y
+; CHECK: %a_vec = fadd reassoc nnan <3 x float> %vec, %vec
+ %a_vec = fadd reassoc nnan <3 x float> %vec, %vec
; CHECK: %b = fsub fast float %x, %y
%b = fsub nnan nsz fast float %x, %y
; CHECK: %b_vec = fsub nnan <3 x float> %vec, %vec
diff --git a/test/Bitcode/compatibility-3.6.ll b/test/Bitcode/compatibility-3.6.ll
index e9313dfba87..6c47a853e24 100644
--- a/test/Bitcode/compatibility-3.6.ll
+++ b/test/Bitcode/compatibility-3.6.ll
@@ -612,7 +612,9 @@ define void @fastmathflags(float %op1, float %op2) {
%f.arcp = fadd arcp float %op1, %op2
; CHECK: %f.arcp = fadd arcp float %op1, %op2
%f.fast = fadd fast float %op1, %op2
- ; CHECK: %f.fast = fadd fast float %op1, %op2
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'.
+ ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2
ret void
}
diff --git a/test/Bitcode/compatibility-3.7.ll b/test/Bitcode/compatibility-3.7.ll
index 82fc9905535..55844e5c498 100644
--- a/test/Bitcode/compatibility-3.7.ll
+++ b/test/Bitcode/compatibility-3.7.ll
@@ -656,7 +656,9 @@ define void @fastmathflags(float %op1, float %op2) {
%f.arcp = fadd arcp float %op1, %op2
; CHECK: %f.arcp = fadd arcp float %op1, %op2
%f.fast = fadd fast float %op1, %op2
- ; CHECK: %f.fast = fadd fast float %op1, %op2
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'.
+ ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2
ret void
}
diff --git a/test/Bitcode/compatibility-3.8.ll b/test/Bitcode/compatibility-3.8.ll
index 2e70a380d10..a7fa20f2bc0 100644
--- a/test/Bitcode/compatibility-3.8.ll
+++ b/test/Bitcode/compatibility-3.8.ll
@@ -687,7 +687,9 @@ define void @fastmathflags(float %op1, float %op2) {
%f.arcp = fadd arcp float %op1, %op2
; CHECK: %f.arcp = fadd arcp float %op1, %op2
%f.fast = fadd fast float %op1, %op2
- ; CHECK: %f.fast = fadd fast float %op1, %op2
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'.
+ ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2
ret void
}
@@ -700,7 +702,9 @@ declare <4 x double> @fmf3()
; CHECK-LABEL: fastMathFlagsForCalls(
define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) {
%call.fast = call fast float @fmf1()
- ; CHECK: %call.fast = call fast float @fmf1()
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'contract' and 'aml' bits set, so this is not fully 'fast'.
+ ; CHECK: %call.fast = call reassoc nnan ninf nsz arcp float @fmf1()
; Throw in some other attributes to make sure those stay in the right places.
diff --git a/test/Bitcode/compatibility-3.9.ll b/test/Bitcode/compatibility-3.9.ll
index 7c84daa7d3c..c456fefe9d4 100644
--- a/test/Bitcode/compatibility-3.9.ll
+++ b/test/Bitcode/compatibility-3.9.ll
@@ -758,7 +758,9 @@ define void @fastmathflags(float %op1, float %op2) {
%f.arcp = fadd arcp float %op1, %op2
; CHECK: %f.arcp = fadd arcp float %op1, %op2
%f.fast = fadd fast float %op1, %op2
- ; CHECK: %f.fast = fadd fast float %op1, %op2
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'.
+ ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2
ret void
}
@@ -771,7 +773,9 @@ declare <4 x double> @fmf3()
; CHECK-LABEL: fastMathFlagsForCalls(
define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) {
%call.fast = call fast float @fmf1()
- ; CHECK: %call.fast = call fast float @fmf1()
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'.
+ ; CHECK: %call.fast = call reassoc nnan ninf nsz arcp float @fmf1()
; Throw in some other attributes to make sure those stay in the right places.
diff --git a/test/Bitcode/compatibility-4.0.ll b/test/Bitcode/compatibility-4.0.ll
index 9e34d48c95f..68446a7d5b0 100644
--- a/test/Bitcode/compatibility-4.0.ll
+++ b/test/Bitcode/compatibility-4.0.ll
@@ -757,8 +757,10 @@ define void @fastmathflags(float %op1, float %op2) {
; CHECK: %f.nsz = fadd nsz float %op1, %op2
%f.arcp = fadd arcp float %op1, %op2
; CHECK: %f.arcp = fadd arcp float %op1, %op2
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'.
%f.fast = fadd fast float %op1, %op2
- ; CHECK: %f.fast = fadd fast float %op1, %op2
+ ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2
ret void
}
@@ -771,7 +773,9 @@ declare <4 x double> @fmf3()
; CHECK-LABEL: fastMathFlagsForCalls(
define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) {
%call.fast = call fast float @fmf1()
- ; CHECK: %call.fast = call fast float @fmf1()
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'.
+ ; CHECK: %call.fast = call reassoc nnan ninf nsz arcp float @fmf1()
; Throw in some other attributes to make sure those stay in the right places.
diff --git a/test/Bitcode/compatibility-5.0.ll b/test/Bitcode/compatibility-5.0.ll
index a4b3fca82b7..cdadc032d87 100644
--- a/test/Bitcode/compatibility-5.0.ll
+++ b/test/Bitcode/compatibility-5.0.ll
@@ -765,7 +765,9 @@ define void @fastmathflags(float %op1, float %op2) {
%f.contract = fadd contract float %op1, %op2
; CHECK: %f.contract = fadd contract float %op1, %op2
%f.fast = fadd fast float %op1, %op2
- ; CHECK: %f.fast = fadd fast float %op1, %op2
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'afn' bit set, so this is not fully 'fast'.
+ ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp contract float %op1, %op2
ret void
}
@@ -778,7 +780,9 @@ declare <4 x double> @fmf3()
; CHECK-LABEL: fastMathFlagsForCalls(
define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) {
%call.fast = call fast float @fmf1()
- ; CHECK: %call.fast = call fast float @fmf1()
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'afn' bit set, so this is not fully 'fast'.
+ ; CHECK: %call.fast = call reassoc nnan ninf nsz arcp contract float @fmf1()
; Throw in some other attributes to make sure those stay in the right places.
diff --git a/test/Bitcode/compatibility.ll b/test/Bitcode/compatibility.ll
index 7d4167f4cb0..0157fd438a7 100644
--- a/test/Bitcode/compatibility.ll
+++ b/test/Bitcode/compatibility.ll
@@ -775,6 +775,10 @@ define void @fastmathflags(float %op1, float %op2) {
; CHECK: %f.arcp = fadd arcp float %op1, %op2
%f.contract = fadd contract float %op1, %op2
; CHECK: %f.contract = fadd contract float %op1, %op2
+ %f.afn = fadd afn float %op1, %op2
+ ; CHECK: %f.afn = fadd afn float %op1, %op2
+ %f.reassoc = fadd reassoc float %op1, %op2
+ ; CHECK: %f.reassoc = fadd reassoc float %op1, %op2
%f.fast = fadd fast float %op1, %op2
; CHECK: %f.fast = fadd fast float %op1, %op2
ret void
diff --git a/test/Bitcode/thinlto-summary-local-5.0.ll b/test/Bitcode/thinlto-summary-local-5.0.ll
new file mode 100644
index 00000000000..cbc48d23df3
--- /dev/null
+++ b/test/Bitcode/thinlto-summary-local-5.0.ll
@@ -0,0 +1,22 @@
+; Bitcode compatibility test for dso_local flag in thin-lto summaries.
+; Checks that older bitcode summaries without the dso_local op are still
+; properly parsed and don't set GlobalValues as dso_local.
+
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; RUN: llvm-bcanalyzer -dump %s.bc | FileCheck %s --check-prefix=BCAN
+
+define void @foo() {
+;CHECK-DAG:define void @foo()
+ ret void
+}
+
+@bar = global i32 0
+;CHECK-DAG: @bar = global i32 0
+
+@baz = alias i32, i32* @bar
+;CHECK-DAG: @bar = global i32 0
+
+;BCAN: <SOURCE_FILENAME
+;BCAN-NEXT: <GLOBALVAR {{.*}} op7=0/>
+;BCAN-NEXT: <FUNCTION {{.*}} op16=0/>
+;BCAN-NEXT: <ALIAS {{.*}} op9=0/>
diff --git a/test/Bitcode/thinlto-summary-local-5.0.ll.bc b/test/Bitcode/thinlto-summary-local-5.0.ll.bc
new file mode 100644
index 00000000000..8dc7ca0a74b
--- /dev/null
+++ b/test/Bitcode/thinlto-summary-local-5.0.ll.bc
Binary files differ
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
index 25c0e78a7b2..4a4c3c58072 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
@@ -167,3 +167,70 @@ end:
%vec = load <2 x i16*>, <2 x i16*>* undef
br label %block
}
+
+; FALLBACK-WITH-REPORT-ERR-G_IMPLICIT_DEF-LEGALIZABLE: (FIXME: this is what is expected once we can legalize non-pow-of-2 G_IMPLICIT_DEF) remark: <unknown>:0:0: unable to legalize instruction: %vreg1<def>(s96) = G_INSERT %vreg2, %vreg0, 0; (in function: nonpow2_insertvalue_narrowing
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg2<def>(s96) = G_IMPLICIT_DEF; (in function: nonpow2_insertvalue_narrowing
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_insertvalue_narrowing
+; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_insertvalue_narrowing:
+%struct96 = type { float, float, float }
+define void @nonpow2_insertvalue_narrowing(float %a) {
+ %dummy = insertvalue %struct96 undef, float %a, 0
+ ret void
+}
+
+; FALLBACK-WITH-REPORT-ERR remark: <unknown>:0:0: unable to legalize instruction: %vreg3<def>(s96) = G_ADD %vreg2, %vreg2; (in function: nonpow2_add_narrowing
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_add_narrowing
+; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_add_narrowing:
+define void @nonpow2_add_narrowing() {
+ %a = add i128 undef, undef
+ %b = trunc i128 %a to i96
+ %dummy = add i96 %b, %b
+ ret void
+}
+
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg3<def>(s96) = G_OR %vreg2, %vreg2; (in function: nonpow2_or_narrowing
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_or_narrowing
+; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_or_narrowing:
+define void @nonpow2_or_narrowing() {
+ %a = add i128 undef, undef
+ %b = trunc i128 %a to i96
+ %dummy = or i96 %b, %b
+ ret void
+}
+
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg0<def>(s96) = G_LOAD %vreg1; mem:LD12[undef](align=16) (in function: nonpow2_load_narrowing
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_load_narrowing
+; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_load_narrowing:
+define void @nonpow2_load_narrowing() {
+ %dummy = load i96, i96* undef
+ ret void
+}
+
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: G_STORE %vreg3, %vreg0; mem:ST12[%c](align=16) (in function: nonpow2_store_narrowing
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_store_narrowing
+; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_store_narrowing:
+define void @nonpow2_store_narrowing(i96* %c) {
+ %a = add i128 undef, undef
+ %b = trunc i128 %a to i96
+ store i96 %b, i96* %c
+ ret void
+}
+
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg0<def>(s96) = G_CONSTANT 0; (in function: nonpow2_constant_narrowing
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_constant_narrowing
+; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_constant_narrowing:
+define void @nonpow2_constant_narrowing() {
+ store i96 0, i96* undef
+ ret void
+}
+
+; Currently can't handle vector lengths that aren't an exact multiple of
+; natively supported vector lengths. Test that the fall-back works for those.
+; FALLBACK-WITH-REPORT-ERR-G_IMPLICIT_DEF-LEGALIZABLE: (FIXME: this is what is expected once we can legalize non-pow-of-2 G_IMPLICIT_DEF) remark: <unknown>:0:0: unable to legalize instruction: %vreg1<def>(<7 x s64>) = G_ADD %vreg0, %vreg0; (in function: nonpow2_vector_add_fewerelements
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg0<def>(<7 x s64>) = G_IMPLICIT_DEF; (in function: nonpow2_vector_add_fewerelements
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_vector_add_fewerelements
+; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_vector_add_fewerelements:
+define void @nonpow2_vector_add_fewerelements() {
+ %dummy = add <7 x i64> undef, undef
+ ret void
+}
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir
index 4042047dfc2..cc158a29c3e 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir
@@ -92,6 +92,10 @@
store double %vres, double* %addr
ret void
}
+
+ define void @fp16Ext32() { ret void }
+ define void @fp16Ext64() { ret void }
+ define void @fp32Ext64() { ret void }
...
---
@@ -742,3 +746,103 @@ body: |
RET_ReallyLR
...
+
+---
+# Make sure we map FPEXT on FPR register bank.
+# CHECK-LABEL: name: fp16Ext32
+name: fp16Ext32
+alignment: 2
+legalized: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 2, class: fpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 3, class: fpr, preferred-register: '' }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# CHECK: %1:gpr(s32) = COPY %w0
+# CHECK-NEXT: %0:gpr(s16) = G_TRUNC %1
+# %0 has been mapped to GPR, we need to repair to match FPR.
+# CHECK-NEXT: %3:fpr(s16) = COPY %0
+# CHECK-NEXT: %2:fpr(s32) = G_FPEXT %3
+# CHECK-NEXT: %s0 = COPY %2
+# CHECK-NEXT: RET_ReallyLR
+
+body: |
+ bb.1:
+ liveins: %w0
+
+ %1(s32) = COPY %w0
+ %0(s16) = G_TRUNC %1(s32)
+ %2(s32) = G_FPEXT %0(s16)
+ %s0 = COPY %2(s32)
+ RET_ReallyLR implicit %s0
+
+...
+
+---
+# Make sure we map FPEXT on FPR register bank.
+# CHECK-LABEL: name: fp16Ext64
+name: fp16Ext64
+alignment: 2
+legalized: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 2, class: fpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 3, class: fpr, preferred-register: '' }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# CHECK: %1:gpr(s32) = COPY %w0
+# CHECK-NEXT: %0:gpr(s16) = G_TRUNC %1
+# %0 has been mapped to GPR, we need to repair to match FPR.
+# CHECK-NEXT: %3:fpr(s16) = COPY %0
+# CHECK-NEXT: %2:fpr(s64) = G_FPEXT %3
+# CHECK-NEXT: %d0 = COPY %2
+# CHECK-NEXT: RET_ReallyLR
+
+body: |
+ bb.1:
+ liveins: %w0
+
+ %1(s32) = COPY %w0
+ %0(s16) = G_TRUNC %1(s32)
+ %2(s64) = G_FPEXT %0(s16)
+ %d0 = COPY %2(s64)
+ RET_ReallyLR implicit %d0
+
+...
+
+---
+# Make sure we map FPEXT on FPR register bank.
+# CHECK-LABEL: name: fp32Ext64
+name: fp32Ext64
+alignment: 2
+legalized: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: fpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 2, class: fpr, preferred-register: '' }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+# CHECK: %0:gpr(s32) = COPY %w0
+# %0 has been mapped to GPR, we need to repair to match FPR.
+# CHECK-NEXT: %2:fpr(s32) = COPY %0
+# CHECK-NEXT: %1:fpr(s64) = G_FPEXT %2
+# CHECK-NEXT: %d0 = COPY %1
+# CHECK-NEXT: RET_ReallyLR
+body: |
+ bb.1:
+ liveins: %w0
+
+ %0(s32) = COPY %w0
+ %1(s64) = G_FPEXT %0(s32)
+ %d0 = COPY %1(s64)
+ RET_ReallyLR implicit %d0
+
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
index fa6727da1bb..20449c53a59 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
@@ -8,6 +8,10 @@
entry:
ret void
}
+ define void @test_scalar_add_big_nonpow2() {
+ entry:
+ ret void
+ }
define void @test_scalar_add_small() {
entry:
ret void
@@ -16,6 +20,10 @@
entry:
ret void
}
+ define void @test_vector_add_nonpow2() {
+ entry:
+ ret void
+ }
...
---
@@ -58,6 +66,49 @@ body: |
...
---
+name: test_scalar_add_big_nonpow2
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+ - { id: 8, class: _ }
+ - { id: 9, class: _ }
+body: |
+ bb.0.entry:
+ liveins: %x0, %x1, %x2, %x3
+ ; CHECK-LABEL: name: test_scalar_add_big_nonpow2
+ ; CHECK-NOT: G_MERGE_VALUES
+ ; CHECK-NOT: G_UNMERGE_VALUES
+ ; CHECK-DAG: [[CARRY0_32:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-DAG: [[CARRY0:%[0-9]+]]:_(s1) = G_TRUNC [[CARRY0_32]]
+ ; CHECK: [[RES_LO:%[0-9]+]]:_(s64), [[CARRY1:%[0-9]+]]:_(s1) = G_UADDE %0, %1, [[CARRY0]]
+ ; CHECK: [[RES_MI:%[0-9]+]]:_(s64), [[CARRY2:%[0-9]+]]:_(s1) = G_UADDE %1, %2, [[CARRY1]]
+ ; CHECK: [[RES_HI:%[0-9]+]]:_(s64), {{%.*}}(s1) = G_UADDE %2, %3, [[CARRY2]]
+ ; CHECK-NOT: G_MERGE_VALUES
+ ; CHECK-NOT: G_UNMERGE_VALUES
+ ; CHECK: %x0 = COPY [[RES_LO]]
+ ; CHECK: %x1 = COPY [[RES_MI]]
+ ; CHECK: %x2 = COPY [[RES_HI]]
+
+ %0(s64) = COPY %x0
+ %1(s64) = COPY %x1
+ %2(s64) = COPY %x2
+ %3(s64) = COPY %x3
+ %4(s192) = G_MERGE_VALUES %0, %1, %2
+ %5(s192) = G_MERGE_VALUES %1, %2, %3
+ %6(s192) = G_ADD %4, %5
+ %7(s64), %8(s64), %9(s64) = G_UNMERGE_VALUES %6
+ %x0 = COPY %7
+ %x1 = COPY %8
+ %x2 = COPY %9
+...
+
+---
name: test_scalar_add_small
registers:
- { id: 0, class: _ }
@@ -124,3 +175,43 @@ body: |
%q0 = COPY %7
%q1 = COPY %8
...
+---
+name: test_vector_add_nonpow2
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+ - { id: 8, class: _ }
+ - { id: 9, class: _ }
+body: |
+ bb.0.entry:
+ liveins: %q0, %q1, %q2, %q3
+ ; CHECK-LABEL: name: test_vector_add_nonpow2
+ ; CHECK-NOT: G_EXTRACT
+ ; CHECK-NOT: G_SEQUENCE
+ ; CHECK: [[RES_LO:%[0-9]+]]:_(<2 x s64>) = G_ADD %0, %1
+ ; CHECK: [[RES_MI:%[0-9]+]]:_(<2 x s64>) = G_ADD %1, %2
+ ; CHECK: [[RES_HI:%[0-9]+]]:_(<2 x s64>) = G_ADD %2, %3
+ ; CHECK-NOT: G_EXTRACT
+ ; CHECK-NOT: G_SEQUENCE
+ ; CHECK: %q0 = COPY [[RES_LO]]
+ ; CHECK: %q1 = COPY [[RES_MI]]
+ ; CHECK: %q2 = COPY [[RES_HI]]
+
+ %0(<2 x s64>) = COPY %q0
+ %1(<2 x s64>) = COPY %q1
+ %2(<2 x s64>) = COPY %q2
+ %3(<2 x s64>) = COPY %q3
+ %4(<6 x s64>) = G_MERGE_VALUES %0, %1, %2
+ %5(<6 x s64>) = G_MERGE_VALUES %1, %2, %3
+ %6(<6 x s64>) = G_ADD %4, %5
+ %7(<2 x s64>), %8(<2 x s64>), %9(<2 x s64>) = G_UNMERGE_VALUES %6
+ %q0 = COPY %7
+ %q1 = COPY %8
+ %q2 = COPY %9
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir b/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir
index 7432b6761b7..405e6b54663 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir
@@ -9,6 +9,7 @@
define void @test_inserts_4() { ret void }
define void @test_inserts_5() { ret void }
define void @test_inserts_6() { ret void }
+ define void @test_inserts_nonpow2() { ret void }
...
---
@@ -141,3 +142,21 @@ body: |
%4:_(s128) = G_INSERT %3, %2, 32
RET_ReallyLR
...
+
+---
+name: test_inserts_nonpow2
+body: |
+ bb.0:
+ liveins: %x0, %x1, %x2
+
+
+ ; CHECK-LABEL: name: test_inserts_nonpow2
+ ; CHECK: %5:_(s192) = G_MERGE_VALUES %3(s64), %1(s64), %2(s64)
+ %0:_(s64) = COPY %x0
+ %1:_(s64) = COPY %x1
+ %2:_(s64) = COPY %x2
+ %3:_(s64) = COPY %x3
+ %4:_(s192) = G_MERGE_VALUES %0, %1, %2
+ %5:_(s192) = G_INSERT %4, %3, 0
+ RET_ReallyLR
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir b/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir
index c7b7ec9b6fe..33b48351106 100644
--- a/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir
+++ b/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir
@@ -15,11 +15,11 @@ body: |
%1:gpr(s64) = G_IMPLICIT_DEF
; CHECK: body:
- ; CHECK: [[TMP:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %0, 15
+ ; CHECK: [[TMP:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32
; CHECK: %2:gpr64 = BFMXri %1, [[TMP]], 0, 31
%2:gpr(s64) = G_INSERT %1, %0, 0
- ; CHECK: [[TMP:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %0, 15
+ ; CHECK: [[TMP:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32
; CHECK: %3:gpr64 = BFMXri %1, [[TMP]], 51, 31
%3:gpr(s64) = G_INSERT %1, %0, 13
diff --git a/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir b/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir
index 2c2e475a87a..bd75c4e661e 100644
--- a/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir
+++ b/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir
@@ -33,7 +33,7 @@ body: |
; CHECK-LABEL: name: anyext_s64_from_s32
; CHECK: [[COPY:%[0-9]+]]:gpr32all = COPY %w0
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64all = SUBREG_TO_REG 0, [[COPY]], 15
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64all = SUBREG_TO_REG 0, [[COPY]], %subreg.sub_32
; CHECK: [[COPY1:%[0-9]+]]:gpr64all = COPY [[SUBREG_TO_REG]]
; CHECK: %x0 = COPY [[COPY1]]
%0(s32) = COPY %w0
@@ -80,7 +80,7 @@ body: |
; CHECK-LABEL: name: zext_s64_from_s32
; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %w0
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[COPY]], 15
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.sub_32
; CHECK: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri [[SUBREG_TO_REG]], 0, 31
; CHECK: %x0 = COPY [[UBFMXri]]
%0(s32) = COPY %w0
@@ -177,7 +177,7 @@ body: |
; CHECK-LABEL: name: sext_s64_from_s32
; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %w0
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[COPY]], 15
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.sub_32
; CHECK: [[SBFMXri:%[0-9]+]]:gpr64 = SBFMXri [[SUBREG_TO_REG]], 0, 31
; CHECK: %x0 = COPY [[SBFMXri]]
%0(s32) = COPY %w0
diff --git a/test/CodeGen/AArch64/dwarf-cfi.ll b/test/CodeGen/AArch64/dwarf-cfi.ll
new file mode 100644
index 00000000000..a75bcd19c69
--- /dev/null
+++ b/test/CodeGen/AArch64/dwarf-cfi.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple aarch64-windows-gnu -filetype=asm -o - %s | FileCheck %s
+
+define void @_Z1gv() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+ invoke void @_Z1fv()
+ to label %try.cont unwind label %lpad
+
+lpad:
+ %0 = landingpad { i8*, i32 }
+ catch i8* null
+ %1 = extractvalue { i8*, i32 } %0, 0
+ %2 = tail call i8* @__cxa_begin_catch(i8* %1) #2
+ tail call void @__cxa_end_catch()
+ br label %try.cont
+
+try.cont:
+ ret void
+}
+
+declare void @_Z1fv()
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+; CHECK-LABEL: _Z1gv:
+; CHECK: .cfi_startproc
+; CHECK: .cfi_personality 0, __gxx_personality_v0
+; CHECK: .cfi_lsda 0, .Lexception0
+; CHECK: str x30, [sp, #-16]!
+; CHECK: .cfi_def_cfa_offset 16
+; CHECK: .cfi_offset w30, -16
+; CHECK: ldr x30, [sp], #16
+; CHECK: .cfi_endproc
diff --git a/test/CodeGen/AArch64/recp-fastmath.ll b/test/CodeGen/AArch64/recp-fastmath.ll
index 38e0fb360e4..4776931cf06 100644
--- a/test/CodeGen/AArch64/recp-fastmath.ll
+++ b/test/CodeGen/AArch64/recp-fastmath.ll
@@ -18,6 +18,8 @@ define float @frecp1(float %x) #1 {
; CHECK-NEXT: BB#0
; CHECK-NEXT: frecpe [[R:s[0-7]]]
; CHECK-NEXT: frecps {{s[0-7](, s[0-7])?}}, [[R]]
+; CHECK: frecps {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}}
+; CHECK-NOT: frecps {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}}
}
define <2 x float> @f2recp0(<2 x float> %x) #0 {
@@ -38,6 +40,8 @@ define <2 x float> @f2recp1(<2 x float> %x) #1 {
; CHECK-NEXT: BB#0
; CHECK-NEXT: frecpe [[R:v[0-7]\.2s]]
; CHECK-NEXT: frecps {{v[0-7]\.2s(, v[0-7].2s)?}}, [[R]]
+; CHECK: frecps {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}}
+; CHECK-NOT: frecps {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}}
}
define <4 x float> @f4recp0(<4 x float> %x) #0 {
@@ -58,6 +62,8 @@ define <4 x float> @f4recp1(<4 x float> %x) #1 {
; CHECK-NEXT: BB#0
; CHECK-NEXT: frecpe [[R:v[0-7]\.4s]]
; CHECK-NEXT: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[R]]
+; CHECK: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK-NOT: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
}
define <8 x float> @f8recp0(<8 x float> %x) #0 {
@@ -77,10 +83,12 @@ define <8 x float> @f8recp1(<8 x float> %x) #1 {
; CHECK-LABEL: f8recp1:
; CHECK-NEXT: BB#0
-; CHECK-NEXT: frecpe [[RA:v[0-7]\.4s]]
-; CHECK-NEXT: frecpe [[RB:v[0-7]\.4s]]
-; CHECK-NEXT: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[RA]]
-; CHECK: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[RB]]
+; CHECK-NEXT: frecpe [[R:v[0-7]\.4s]]
+; CHECK: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[R]]
+; CHECK: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, {{v[0-7]\.4s}}
+; CHECK: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK-NOT: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
}
define double @drecp0(double %x) #0 {
@@ -101,6 +109,9 @@ define double @drecp1(double %x) #1 {
; CHECK-NEXT: BB#0
; CHECK-NEXT: frecpe [[R:d[0-7]]]
; CHECK-NEXT: frecps {{d[0-7](, d[0-7])?}}, [[R]]
+; CHECK: frecps {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
+; CHECK: frecps {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
+; CHECK-NOT: frecps {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
}
define <2 x double> @d2recp0(<2 x double> %x) #0 {
@@ -121,6 +132,9 @@ define <2 x double> @d2recp1(<2 x double> %x) #1 {
; CHECK-NEXT: BB#0
; CHECK-NEXT: frecpe [[R:v[0-7]\.2d]]
; CHECK-NEXT: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[R]]
+; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
}
define <4 x double> @d4recp0(<4 x double> %x) #0 {
@@ -140,10 +154,14 @@ define <4 x double> @d4recp1(<4 x double> %x) #1 {
; CHECK-LABEL: d4recp1:
; CHECK-NEXT: BB#0
-; CHECK-NEXT: frecpe [[RA:v[0-7]\.2d]]
-; CHECK-NEXT: frecpe [[RB:v[0-7]\.2d]]
-; CHECK-NEXT: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[RA]]
-; CHECK: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[RB]]
+; CHECK-NEXT: frecpe [[R:v[0-7]\.2d]]
+; CHECK: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[R]]
+; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
}
attributes #0 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AArch64/sqrt-fastmath.ll b/test/CodeGen/AArch64/sqrt-fastmath.ll
index 079562c0581..4dd0516faf0 100644
--- a/test/CodeGen/AArch64/sqrt-fastmath.ll
+++ b/test/CodeGen/AArch64/sqrt-fastmath.ll
@@ -22,7 +22,9 @@ define float @fsqrt(float %a) #0 {
; CHECK-NEXT: frsqrte [[RA:s[0-7]]]
; CHECK-NEXT: fmul [[RB:s[0-7]]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{s[0-7](, s[0-7])?}}, [[RB]]
-; CHECK: fcmp s0, #0
+; CHECK: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}}
+; CHECK-NOT: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}}
+; CHECK: fcmp {{s[0-7]}}, #0
}
define <2 x float> @f2sqrt(<2 x float> %a) #0 {
@@ -38,7 +40,9 @@ define <2 x float> @f2sqrt(<2 x float> %a) #0 {
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2s]]
; CHECK-NEXT: fmul [[RB:v[0-7]\.2s]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{v[0-7]\.2s(, v[0-7]\.2s)?}}, [[RB]]
-; CHECK: fcmeq {{v[0-7]\.2s, v0\.2s}}, #0
+; CHECK: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}}
+; CHECK-NOT: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}}
+; CHECK: fcmeq {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, #0
}
define <4 x float> @f4sqrt(<4 x float> %a) #0 {
@@ -54,7 +58,9 @@ define <4 x float> @f4sqrt(<4 x float> %a) #0 {
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]]
; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]]
-; CHECK: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0
+; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0
}
define <8 x float> @f8sqrt(<8 x float> %a) #0 {
@@ -69,9 +75,16 @@ define <8 x float> @f8sqrt(<8 x float> %a) #0 {
; CHECK-LABEL: f8sqrt:
; CHECK-NEXT: BB#0
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]]
-; CHECK: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]]
-; CHECK: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]]
-; CHECK: fcmeq {{v[0-7]\.4s, v[0-1]\.4s}}, #0
+; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]]
+; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]]
+; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0
+; CHECK: frsqrte [[RC:v[0-7]\.4s]]
+; CHECK-NEXT: fmul [[RD:v[0-7]\.4s]], [[RC]], [[RC]]
+; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RD]]
+; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0
}
define double @dsqrt(double %a) #0 {
@@ -87,7 +100,10 @@ define double @dsqrt(double %a) #0 {
; CHECK-NEXT: frsqrte [[RA:d[0-7]]]
; CHECK-NEXT: fmul [[RB:d[0-7]]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{d[0-7](, d[0-7])?}}, [[RB]]
-; CHECK: fcmp d0, #0
+; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
+; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
+; CHECK-NOT: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
+; CHECK: fcmp {{d[0-7]}}, #0
}
define <2 x double> @d2sqrt(<2 x double> %a) #0 {
@@ -103,7 +119,10 @@ define <2 x double> @d2sqrt(<2 x double> %a) #0 {
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]]
; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]]
-; CHECK: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0
}
define <4 x double> @d4sqrt(<4 x double> %a) #0 {
@@ -118,9 +137,19 @@ define <4 x double> @d4sqrt(<4 x double> %a) #0 {
; CHECK-LABEL: d4sqrt:
; CHECK-NEXT: BB#0
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]]
-; CHECK: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]]
-; CHECK: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]]
-; CHECK: fcmeq {{v[0-7]\.2d, v[0-1]\.2d}}, #0
+; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]]
+; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]]
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0
+; CHECK: frsqrte [[RC:v[0-7]\.2d]]
+; CHECK-NEXT: fmul [[RD:v[0-7]\.2d]], [[RC]], [[RC]]
+; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RD]]
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0
}
define float @frsqrt(float %a) #0 {
@@ -137,6 +166,8 @@ define float @frsqrt(float %a) #0 {
; CHECK-NEXT: frsqrte [[RA:s[0-7]]]
; CHECK-NEXT: fmul [[RB:s[0-7]]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{s[0-7](, s[0-7])?}}, [[RB]]
+; CHECK: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}}
+; CHECK-NOT: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}}
; CHECK-NOT: fcmp {{s[0-7]}}, #0
}
@@ -154,7 +185,9 @@ define <2 x float> @f2rsqrt(<2 x float> %a) #0 {
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2s]]
; CHECK-NEXT: fmul [[RB:v[0-7]\.2s]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{v[0-7]\.2s(, v[0-7]\.2s)?}}, [[RB]]
-; CHECK-NOT: fcmeq {{v[0-7]\.2s, v0\.2s}}, #0
+; CHECK: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}}
+; CHECK-NOT: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}}
+; CHECK-NOT: fcmeq {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, #0
}
define <4 x float> @f4rsqrt(<4 x float> %a) #0 {
@@ -171,7 +204,9 @@ define <4 x float> @f4rsqrt(<4 x float> %a) #0 {
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]]
; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]]
-; CHECK-NOT: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0
+; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK-NOT: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0
}
define <8 x float> @f8rsqrt(<8 x float> %a) #0 {
@@ -189,7 +224,11 @@ define <8 x float> @f8rsqrt(<8 x float> %a) #0 {
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]]
; CHECK: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]]
; CHECK: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]]
-; CHECK-NOT: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0
+; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK-NOT: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0
}
define double @drsqrt(double %a) #0 {
@@ -206,6 +245,9 @@ define double @drsqrt(double %a) #0 {
; CHECK-NEXT: frsqrte [[RA:d[0-7]]]
; CHECK-NEXT: fmul [[RB:d[0-7]]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{d[0-7](, d[0-7])?}}, [[RB]]
+; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
+; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
+; CHECK-NOT: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
; CHECK-NOT: fcmp d0, #0
}
@@ -223,7 +265,10 @@ define <2 x double> @d2rsqrt(<2 x double> %a) #0 {
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]]
; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]]
-; CHECK-NOT: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0
}
define <4 x double> @d4rsqrt(<4 x double> %a) #0 {
@@ -241,7 +286,13 @@ define <4 x double> @d4rsqrt(<4 x double> %a) #0 {
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]]
; CHECK: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]]
; CHECK: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]]
-; CHECK-NOT: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0
}
attributes #0 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
index 4c05383615a..70e2b5e4ae2 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
@@ -44,28 +44,28 @@ regBankSelected: true
# Max immediate for CI
# SIVI: [[K_LO:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967292
# SIVI: [[K_HI:%[0-9]+]]:sreg_32 = S_MOV_B32 3
-# SIVI: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2
+# SIVI: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], %subreg.sub0, [[K_HI]], %subreg.sub1
# SIVI-DAG: [[K_SUB0:%[0-9]+]]:sgpr_32 = COPY [[K]].sub0
# SIVI-DAG: [[PTR_LO:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub0
# SIVI: [[ADD_PTR_LO:%[0-9]+]]:sreg_32 = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
# SIVI-DAG: [[K_SUB1:%[0-9]+]]:sgpr_32 = COPY [[K]].sub1
# SIVI-DAG: [[PTR_HI:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub1
# SIVI: [[ADD_PTR_HI:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]]
-# SIVI: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2
+# SIVI: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], %subreg.sub0, [[ADD_PTR_HI]], %subreg.sub1
# SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 4294967295, 0
# Immediate overflow for CI
# GCN: [[K_LO:%[0-9]+]]:sreg_32 = S_MOV_B32 0
# GCN: [[K_HI:%[0-9]+]]:sreg_32 = S_MOV_B32 4
-# GCN: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2
+# GCN: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], %subreg.sub0, [[K_HI]], %subreg.sub1
# GCN-DAG: [[K_SUB0:%[0-9]+]]:sgpr_32 = COPY [[K]].sub0
# GCN-DAG: [[PTR_LO:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub0
# GCN: [[ADD_PTR_LO:%[0-9]+]]:sreg_32 = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
# GCN-DAG: [[K_SUB1:%[0-9]+]]:sgpr_32 = COPY [[K]].sub1
# GCN-DAG: [[PTR_HI:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub1
# GCN: [[ADD_PTR_HI:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]]
-# GCN: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2
+# GCN: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], %subreg.sub0, [[ADD_PTR_HI]], %subreg.sub1
# GCN: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
# Max 32-bit byte offset
@@ -76,14 +76,14 @@ regBankSelected: true
# Overflow 32-bit byte offset
# SIVI: [[K_LO:%[0-9]+]]:sreg_32 = S_MOV_B32 0
# SIVI: [[K_HI:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-# SIVI: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2
+# SIVI: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], %subreg.sub0, [[K_HI]], %subreg.sub1
# SIVI-DAG: [[K_SUB0:%[0-9]+]]:sgpr_32 = COPY [[K]].sub0
# SIVI-DAG: [[PTR_LO:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub0
# SIVI: [[ADD_PTR_LO:%[0-9]+]]:sreg_32 = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
# SIVI-DAG: [[K_SUB1:%[0-9]+]]:sgpr_32 = COPY [[K]].sub1
# SIVI-DAG: [[PTR_HI:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub1
# SIVI: [[ADD_PTR_HI:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]]
-# SIVI: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2
+# SIVI: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], %subreg.sub0, [[ADD_PTR_HI]], %subreg.sub1
# SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1073741824, 0
diff --git a/test/CodeGen/AMDGPU/detect-dead-lanes.mir b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
index b2f5e816b26..12460d25f3b 100644
--- a/test/CodeGen/AMDGPU/detect-dead-lanes.mir
+++ b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
@@ -6,7 +6,7 @@
# CHECK: S_NOP 0, implicit-def %0
# CHECK: S_NOP 0, implicit-def %1
# CHECK: S_NOP 0, implicit-def dead %2
-# CHECK: %3:sreg_128 = REG_SEQUENCE %0, {{[0-9]+}}, %1, {{[0-9]+}}, undef %2, {{[0-9]+}}
+# CHECK: %3:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, undef %2, %subreg.sub3
# CHECK: S_NOP 0, implicit %3.sub0
# CHECK: S_NOP 0, implicit %3.sub1
# CHECK: S_NOP 0, implicit undef %3.sub2
@@ -42,9 +42,9 @@ body: |
# Check defined lanes transfer; Includes checking for some special cases like
# undef operands or IMPLICIT_DEF definitions.
# CHECK-LABEL: name: test1
-# CHECK: %0:sreg_128 = REG_SEQUENCE %sgpr0, {{[0-9]+}}, %sgpr0, {{[0-9]+}}
-# CHECK: %1:sreg_128 = INSERT_SUBREG %0, %sgpr1, {{[0-9]+}}
-# CHECK: %2:sreg_64 = INSERT_SUBREG %0.sub2_sub3, %sgpr42, {{[0-9]+}}
+# CHECK: %0:sreg_128 = REG_SEQUENCE %sgpr0, %subreg.sub0, %sgpr0, %subreg.sub2
+# CHECK: %1:sreg_128 = INSERT_SUBREG %0, %sgpr1, %subreg.sub3
+# CHECK: %2:sreg_64 = INSERT_SUBREG %0.sub2_sub3, %sgpr42, %subreg.sub0
# CHECK: S_NOP 0, implicit %1.sub0
# CHECK: S_NOP 0, implicit undef %1.sub1
# CHECK: S_NOP 0, implicit %1.sub2
@@ -53,24 +53,24 @@ body: |
# CHECK: S_NOP 0, implicit undef %2.sub1
# CHECK: %3:sreg_32_xm0 = IMPLICIT_DEF
-# CHECK: %4:sreg_128 = INSERT_SUBREG %0, undef %3, {{[0-9]+}}
+# CHECK: %4:sreg_128 = INSERT_SUBREG %0, undef %3, %subreg.sub0
# CHECK: S_NOP 0, implicit undef %4.sub0
# CHECK: S_NOP 0, implicit undef %4.sub1
# CHECK: S_NOP 0, implicit %4.sub2
# CHECK: S_NOP 0, implicit undef %4.sub3
-# CHECK: %5:sreg_64 = EXTRACT_SUBREG %0, {{[0-9]+}}
-# CHECK: %6:sreg_32_xm0 = EXTRACT_SUBREG %5, {{[0-9]+}}
-# CHECK: %7:sreg_32_xm0 = EXTRACT_SUBREG %5, {{[0-9]+}}
+# CHECK: %5:sreg_64 = EXTRACT_SUBREG %0, %subreg.sub0_sub1
+# CHECK: %6:sreg_32_xm0 = EXTRACT_SUBREG %5, %subreg.sub0
+# CHECK: %7:sreg_32_xm0 = EXTRACT_SUBREG %5, %subreg.sub1
# CHECK: S_NOP 0, implicit %5
# CHECK: S_NOP 0, implicit %6
# CHECK: S_NOP 0, implicit undef %7
# CHECK: %8:sreg_64 = IMPLICIT_DEF
-# CHECK: %9:sreg_32_xm0 = EXTRACT_SUBREG undef %8, {{[0-9]+}}
+# CHECK: %9:sreg_32_xm0 = EXTRACT_SUBREG undef %8, %subreg.sub1
# CHECK: S_NOP 0, implicit undef %9
-# CHECK: %10:sreg_128 = EXTRACT_SUBREG undef %0, {{[0-9]+}}
+# CHECK: %10:sreg_128 = EXTRACT_SUBREG undef %0, %subreg.sub2_sub3
# CHECK: S_NOP 0, implicit undef %10
name: test1
registers:
@@ -125,29 +125,29 @@ body: |
# CHECK: S_NOP 0, implicit-def dead %0
# CHECK: S_NOP 0, implicit-def %1
# CHECK: S_NOP 0, implicit-def %2
-# CHECK: %3:sreg_128 = REG_SEQUENCE undef %0, {{[0-9]+}}, %1, {{[0-9]+}}, %2, {{[0-9]+}}
+# CHECK: %3:sreg_128 = REG_SEQUENCE undef %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2_sub3
# CHECK: S_NOP 0, implicit %3.sub1
# CHECK: S_NOP 0, implicit %3.sub3
# CHECK: S_NOP 0, implicit-def %4
# CHECK: S_NOP 0, implicit-def dead %5
-# CHECK: %6:sreg_64 = REG_SEQUENCE %4, {{[0-9]+}}, undef %5, {{[0-9]+}}
+# CHECK: %6:sreg_64 = REG_SEQUENCE %4, %subreg.sub0, undef %5, %subreg.sub1
# CHECK: S_NOP 0, implicit %6
# CHECK: S_NOP 0, implicit-def dead %7
# CHECK: S_NOP 0, implicit-def %8
-# CHECK: %9:sreg_128 = INSERT_SUBREG undef %7, %8, {{[0-9]+}}
+# CHECK: %9:sreg_128 = INSERT_SUBREG undef %7, %8, %subreg.sub2_sub3
# CHECK: S_NOP 0, implicit %9.sub2
# CHECK: S_NOP 0, implicit-def %10
# CHECK: S_NOP 0, implicit-def dead %11
-# CHECK: %12:sreg_128 = INSERT_SUBREG %10, undef %11, {{[0-9]+}}
+# CHECK: %12:sreg_128 = INSERT_SUBREG %10, undef %11, %subreg.sub0_sub1
# CHECK: S_NOP 0, implicit %12.sub3
# CHECK: S_NOP 0, implicit-def %13
# CHECK: S_NOP 0, implicit-def dead %14
-# CHECK: %15:sreg_128 = REG_SEQUENCE %13, {{[0-9]+}}, undef %14, {{[0-9]+}}
-# CHECK: %16:sreg_64 = EXTRACT_SUBREG %15, {{[0-9]+}}
+# CHECK: %15:sreg_128 = REG_SEQUENCE %13, %subreg.sub0_sub1, undef %14, %subreg.sub2_sub3
+# CHECK: %16:sreg_64 = EXTRACT_SUBREG %15, %subreg.sub0_sub1
# CHECK: S_NOP 0, implicit %16.sub1
name: test2
@@ -245,7 +245,7 @@ body: |
# used.
# CHECK-LABEL: name: test5
# CHECK: S_NOP 0, implicit-def %0
-# CHECK: %1:sreg_64 = REG_SEQUENCE undef %0, {{[0-9]+}}, %0, {{[0-9]+}}
+# CHECK: %1:sreg_64 = REG_SEQUENCE undef %0, %subreg.sub0, %0, %subreg.sub1
# CHECK: S_NOP 0, implicit %1.sub1
name: test5
tracksRegLiveness: true
@@ -265,7 +265,7 @@ body: |
# CHECK: S_NOP 0, implicit-def %0
# CHECK: S_NOP 0, implicit-def dead %1
# CHECK: S_NOP 0, implicit-def dead %2
-# CHECK: %3:sreg_128 = REG_SEQUENCE %0, {{[0-9]+}}, undef %1, {{[0-9]+}}, undef %2, {{[0-9]+}}
+# CHECK: %3:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, undef %1, %subreg.sub1, undef %2, %subreg.sub2
# CHECK: bb.1:
# CHECK: %4:sreg_128 = PHI %3, %bb.0, %5, %bb.1
@@ -315,12 +315,12 @@ body: |
# CHECK: S_NOP 0, implicit-def %1
# CHECK: S_NOP 0, implicit-def dead %2
# CHECK: S_NOP 0, implicit-def %3
-# CHECK: %4:sreg_128 = REG_SEQUENCE %0, {{[0-9]+}}, %1, {{[0-9]+}}, undef %2, {{[0-9]+}}, %3, {{[0-9]+}}
+# CHECK: %4:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, undef %2, %subreg.sub2, %3, %subreg.sub3
# CHECK: bb.1:
# CHECK: %5:sreg_128 = PHI %4, %bb.0, %6, %bb.1
-# CHECK: %6:sreg_128 = REG_SEQUENCE %5.sub1, {{[0-9]+}}, %5.sub3, {{[0-9]+}}, undef %5.sub2, {{[0-9]+}}, %5.sub0, {{[0-9]+}}
+# CHECK: %6:sreg_128 = REG_SEQUENCE %5.sub1, %subreg.sub0, %5.sub3, %subreg.sub1, undef %5.sub2, %subreg.sub2, %5.sub0, %subreg.sub3
# CHECK: bb.2:
# CHECK: S_NOP 0, implicit %6.sub3
@@ -361,12 +361,12 @@ body: |
# CHECK-LABEL: name: loop2
# CHECK: bb.0:
# CHECK: S_NOP 0, implicit-def %0
-# CHECK: %1:sreg_128 = REG_SEQUENCE %0, {{[0-9]+}}
+# CHECK: %1:sreg_128 = REG_SEQUENCE %0, %subreg.sub0
# CHECK: bb.1:
# CHECK: %2:sreg_128 = PHI %1, %bb.0, %3, %bb.1
-# CHECK: %3:sreg_128 = REG_SEQUENCE %2.sub3, {{[0-9]+}}, undef %2.sub1, {{[0-9]+}}, %2.sub0, {{[0-9]+}}, %2.sub2, {{[0-9]+}}
+# CHECK: %3:sreg_128 = REG_SEQUENCE %2.sub3, %subreg.sub0, undef %2.sub1, %subreg.sub1, %2.sub0, %subreg.sub2, %2.sub2, %subreg.sub3
# CHECK: bb.2:
# CHECK: S_NOP 0, implicit %2.sub0
diff --git a/test/CodeGen/AMDGPU/mad_64_32.ll b/test/CodeGen/AMDGPU/mad_64_32.ll
new file mode 100644
index 00000000000..b4d9d928101
--- /dev/null
+++ b/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -0,0 +1,168 @@
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+
+; GCN-LABEL: {{^}}mad_i64_i32_sextops:
+; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
+
+; SI: v_mul_lo_i32
+; SI: v_mul_hi_i32
+; SI: v_add_i32
+; SI: v_addc_u32
+define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
+ %sext0 = sext i32 %arg0 to i64
+ %sext1 = sext i32 %arg1 to i64
+ %mul = mul i64 %sext0, %sext1
+ %mad = add i64 %mul, %arg2
+ ret i64 %mad
+}
+
+; GCN-LABEL: {{^}}mad_i64_i32_sextops_commute:
+; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
+
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_i32
+; SI: v_add_i32
+; SI: v_addc_u32
+define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
+ %sext0 = sext i32 %arg0 to i64
+ %sext1 = sext i32 %arg1 to i64
+ %mul = mul i64 %sext0, %sext1
+ %mad = add i64 %arg2, %mul
+ ret i64 %mad
+}
+
+; GCN-LABEL: {{^}}mad_u64_u32_zextops:
+; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3]
+
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI: v_add_i32
+; SI: v_addc_u32
+define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
+ %sext0 = zext i32 %arg0 to i64
+ %sext1 = zext i32 %arg1 to i64
+ %mul = mul i64 %sext0, %sext1
+ %mad = add i64 %mul, %arg2
+ ret i64 %mad
+}
+
+; GCN-LABEL: {{^}}mad_u64_u32_zextops_commute:
+; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3]
+
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI: v_add_i32
+; SI: v_addc_u32
+define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
+ %sext0 = zext i32 %arg0 to i64
+ %sext1 = zext i32 %arg1 to i64
+ %mul = mul i64 %sext0, %sext1
+ %mad = add i64 %arg2, %mul
+ ret i64 %mad
+}
+
+
+
+
+
+
+; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i128:
+; CI: v_mad_u64_u32
+; CI: v_mad_u64_u32
+; CI: v_mad_u64_u32
+; CI: v_mad_i64_i32
+
+; SI-NOT: v_mad_
+define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
+ %sext0 = sext i32 %arg0 to i128
+ %sext1 = sext i32 %arg1 to i128
+ %mul = mul i128 %sext0, %sext1
+ %mad = add i128 %mul, %arg2
+ ret i128 %mad
+}
+
+; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i63:
+; CI: v_lshl_b64
+; CI: v_ashr
+; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
+
+; SI-NOT: v_mad_u64_u32
+define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
+ %sext0 = sext i32 %arg0 to i63
+ %sext1 = sext i32 %arg1 to i63
+ %mul = mul i63 %sext0, %sext1
+ %mad = add i63 %mul, %arg2
+ ret i63 %mad
+}
+
+; GCN-LABEL: {{^}}mad_i64_i32_sextops_i31_i63:
+; CI: v_lshl_b64
+; CI: v_ashr_i64
+; CI: v_bfe_i32 v1, v1, 0, 31
+; CI: v_bfe_i32 v0, v0, 0, 31
+; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
+define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
+ %sext0 = sext i31 %arg0 to i63
+ %sext1 = sext i31 %arg1 to i63
+ %mul = mul i63 %sext0, %sext1
+ %mad = add i63 %mul, %arg2
+ ret i63 %mad
+}
+
+; GCN-LABEL: {{^}}mad_u64_u32_bitops:
+; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v2, v[4:5]
+define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
+ %trunc.lhs = and i64 %arg0, 4294967295
+ %trunc.rhs = and i64 %arg1, 4294967295
+ %mul = mul i64 %trunc.lhs, %trunc.rhs
+ %add = add i64 %mul, %arg2
+ ret i64 %add
+}
+
+; GCN-LABEL: {{^}}mad_u64_u32_bitops_lhs_mask_small:
+; GCN-NOT: v_mad_
+define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
+ %trunc.lhs = and i64 %arg0, 8589934591
+ %trunc.rhs = and i64 %arg1, 4294967295
+ %mul = mul i64 %trunc.lhs, %trunc.rhs
+ %add = add i64 %mul, %arg2
+ ret i64 %add
+}
+
+; GCN-LABEL: {{^}}mad_u64_u32_bitops_rhs_mask_small:
+; GCN-NOT: v_mad_
+define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
+ %trunc.lhs = and i64 %arg0, 4294967295
+ %trunc.rhs = and i64 %arg1, 8589934591
+ %mul = mul i64 %trunc.lhs, %trunc.rhs
+ %add = add i64 %mul, %arg2
+ ret i64 %add
+}
+
+; GCN-LABEL: {{^}}mad_i64_i32_bitops:
+; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v2, v[4:5]
+; SI-NOT: v_mad_
+define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
+ %shl.lhs = shl i64 %arg0, 32
+ %trunc.lhs = ashr i64 %shl.lhs, 32
+ %shl.rhs = shl i64 %arg1, 32
+ %trunc.rhs = ashr i64 %shl.rhs, 32
+ %mul = mul i64 %trunc.lhs, %trunc.rhs
+ %add = add i64 %mul, %arg2
+ ret i64 %add
+}
+
+; Example from bug report
+; GCN-LABEL: {{^}}mad_i64_i32_unpack_i64ops:
+; CI: v_mad_u64_u32 v[0:1], s[6:7], v1, v0, v[0:1]
+; SI-NOT: v_mad_u64_u32
+define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
+ %tmp4 = lshr i64 %arg0, 32
+ %tmp5 = and i64 %arg0, 4294967295
+ %mul = mul nuw i64 %tmp4, %tmp5
+ %mad = add i64 %mul, %arg0
+ ret i64 %mad
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll
index a0290789175..555c65a6ffe 100644
--- a/test/CodeGen/AMDGPU/mul.ll
+++ b/test/CodeGen/AMDGPU/mul.ll
@@ -1,6 +1,6 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
; mul24 and mad24 are affected
@@ -8,8 +8,8 @@
; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
@@ -26,10 +26,10 @@ define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32
; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
@@ -41,10 +41,10 @@ define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> a
}
; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32:
-; SI: s_load_dword
-; SI: s_load_dword
-; SI: s_mul_i32
-; SI: buffer_store_dword
+; GCN: s_load_dword
+; GCN: s_load_dword
+; GCN: s_mul_i32
+; GCN: buffer_store_dword
define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
%mul = mul i64 %b, %a
%trunc = trunc i64 %mul to i32
@@ -53,10 +53,10 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a
}
; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32:
-; SI: s_load_dword
-; SI: s_load_dword
-; SI: v_mul_lo_i32
-; SI: buffer_store_dword
+; GCN: s_load_dword
+; GCN: s_load_dword
+; GCN: v_mul_lo_i32
+; GCN: buffer_store_dword
define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%b = load i64, i64 addrspace(1)* %bptr, align 8
@@ -71,8 +71,8 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 ad
; FUNC-LABEL: {{^}}mul64_sext_c:
; EG-DAG: MULLO_INT
; EG-DAG: MULHI_INT
-; SI-DAG: s_mul_i32
-; SI-DAG: v_mul_hi_i32
+; GCN-DAG: s_mul_i32
+; GCN-DAG: v_mul_hi_i32
define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {
entry:
%0 = sext i32 %in to i64
@@ -84,9 +84,9 @@ entry:
; FUNC-LABEL: {{^}}v_mul64_sext_c:
; EG-DAG: MULLO_INT
; EG-DAG: MULHI_INT
-; SI-DAG: v_mul_lo_i32
-; SI-DAG: v_mul_hi_i32
-; SI: s_endpgm
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_mul_hi_i32
+; GCN: s_endpgm
define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
%val = load i32, i32 addrspace(1)* %in, align 4
%ext = sext i32 %val to i64
@@ -96,9 +96,9 @@ define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(
}
; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm:
-; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
-; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
-; SI: s_endpgm
+; GCN-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
+; GCN-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
+; GCN: s_endpgm
define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
%val = load i32, i32 addrspace(1)* %in, align 4
%ext = sext i32 %val to i64
@@ -108,12 +108,12 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 a
}
; FUNC-LABEL: {{^}}s_mul_i32:
-; SI: s_load_dword [[SRC0:s[0-9]+]],
-; SI: s_load_dword [[SRC1:s[0-9]+]],
-; SI: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]]
-; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: buffer_store_dword [[VRESULT]],
-; SI: s_endpgm
+; GCN: s_load_dword [[SRC0:s[0-9]+]],
+; GCN: s_load_dword [[SRC1:s[0-9]+]],
+; GCN: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]]
+; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; GCN: buffer_store_dword [[VRESULT]],
+; GCN: s_endpgm
define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%mul = mul i32 %a, %b
store i32 %mul, i32 addrspace(1)* %out, align 4
@@ -121,7 +121,7 @@ define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nou
}
; FUNC-LABEL: {{^}}v_mul_i32:
-; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
@@ -146,7 +146,7 @@ define amdgpu_kernel void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nou
}
; FUNC-LABEL: {{^}}v_mul_i64:
-; SI: v_mul_lo_i32
+; GCN: v_mul_lo_i32
define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%b = load i64, i64 addrspace(1)* %bptr, align 8
@@ -156,7 +156,7 @@ define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
}
; FUNC-LABEL: {{^}}mul32_in_branch:
-; SI: s_mul_i32
+; GCN: s_mul_i32
define amdgpu_kernel void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) {
entry:
%0 = icmp eq i32 %a, 0
@@ -177,9 +177,9 @@ endif:
}
; FUNC-LABEL: {{^}}mul64_in_branch:
-; SI-DAG: s_mul_i32
-; SI-DAG: v_mul_hi_u32
-; SI: s_endpgm
+; GCN-DAG: s_mul_i32
+; GCN-DAG: v_mul_hi_u32
+; GCN: s_endpgm
define amdgpu_kernel void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
entry:
%0 = icmp eq i64 %a, 0
@@ -201,29 +201,41 @@ endif:
; FIXME: Load dwordx4
; FUNC-LABEL: {{^}}s_mul_i128:
-; SI: s_load_dwordx2
-; SI: s_load_dwordx2
-; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
; SI: v_mul_hi_u32
; SI: v_mul_hi_u32
; SI: s_mul_i32
; SI: v_mul_hi_u32
; SI: s_mul_i32
+
; SI-DAG: s_mul_i32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_hi_u32
; SI-DAG: s_mul_i32
; SI-DAG: s_mul_i32
; SI-DAG: v_mul_hi_u32
+
; SI: s_mul_i32
; SI: s_mul_i32
; SI: s_mul_i32
; SI: s_mul_i32
; SI: s_mul_i32
-; SI: buffer_store_dwordx4
+
+; VI: s_mul_i32
+; VI: v_mul_hi_u32
+; VI: v_mad_u64_u32
+; VI: s_mul_i32
+; VI: v_mul_hi_u32
+; VI: v_mad_u64_u32
+; VI: v_mad_u64_u32
+
+
+; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
%mul = mul i128 %a, %b
store i128 %mul, i128 addrspace(1)* %out
@@ -231,18 +243,19 @@ define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b)
}
; FUNC-LABEL: {{^}}v_mul_i128:
-; SI: {{buffer|flat}}_load_dwordx4
-; SI: {{buffer|flat}}_load_dwordx4
+; GCN: {{buffer|flat}}_load_dwordx4
+; GCN: {{buffer|flat}}_load_dwordx4
+
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_add_i32_e32
-; SI-DAG: v_mul_lo_i32
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_mul_lo_i32
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_mul_lo_i32
-; SI-DAG: v_mul_lo_i32
-; SI: v_add_i32_e32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_hi_u32
@@ -252,7 +265,11 @@ define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b)
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
-; SI: {{buffer|flat}}_store_dwordx4
+; VI-DAG: v_mad_u64_u32
+; VI: v_mad_u64_u32
+; VI: v_mad_u64_u32
+
+; GCN: {{buffer|flat}}_store_dwordx4
define amdgpu_kernel void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%gep.a = getelementptr inbounds i128, i128 addrspace(1)* %aptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
index 6c6590a154a..9702d18d905 100644
--- a/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
+++ b/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
@@ -5,19 +5,19 @@
# GCN-LABEL: {{^}}name: const_to_sgpr{{$}}
# GCN: %[[HI:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
# GCN-NEXT: %[[LO:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576
-# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]]:sreg_64 = REG_SEQUENCE killed %[[LO]], 1, killed %[[HI]], 2
+# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]]:sreg_64 = REG_SEQUENCE killed %[[LO]], %subreg.sub0, killed %[[HI]], %subreg.sub1
# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec
# GCN-LABEL: {{^}}name: const_to_sgpr_multiple_use{{$}}
# GCN: %[[HI:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
# GCN-NEXT: %[[LO:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576
-# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]]:sreg_64 = REG_SEQUENCE killed %[[LO]], 1, killed %[[HI]], 2
+# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]]:sreg_64 = REG_SEQUENCE killed %[[LO]], %subreg.sub0, killed %[[HI]], %subreg.sub1
# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec
# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec
# GCN-LABEL: {{^}}name: const_to_sgpr_subreg{{$}}
-# GCN: %[[OP0:[0-9]+]]:vreg_64 = REG_SEQUENCE killed %{{[0-9]+}}, 1, killed %{{[0-9]+}}, 2
+# GCN: %[[OP0:[0-9]+]]:vreg_64 = REG_SEQUENCE killed %{{[0-9]+}}, %subreg.sub0, killed %{{[0-9]+}}, %subreg.sub1
# GCN-NEXT: V_CMP_LT_U32_e64 killed %[[OP0]].sub0, 12, implicit %exec
--- |
@@ -109,7 +109,7 @@ body: |
%8 = S_LOAD_DWORDX2_IMM %3, 11, 0
%6 = COPY %7
%9 = S_MOV_B32 0
- %10 = REG_SEQUENCE %2, 1, killed %9, 2
+ %10 = REG_SEQUENCE %2, %subreg.sub0, killed %9, %subreg.sub1
%0 = COPY %10
%11 = COPY %10.sub0
%12 = COPY %10.sub1
@@ -117,10 +117,10 @@ body: |
%14 = COPY %8.sub1
%15 = S_ADD_U32 killed %11, killed %13, implicit-def %scc
%16 = S_ADDC_U32 killed %12, killed %14, implicit-def dead %scc, implicit %scc
- %17 = REG_SEQUENCE killed %15, 1, killed %16, 2
+ %17 = REG_SEQUENCE killed %15, %subreg.sub0, killed %16, %subreg.sub1
%18 = S_MOV_B32 0
%19 = S_MOV_B32 1048576
- %20 = REG_SEQUENCE killed %19, 1, killed %18, 2
+ %20 = REG_SEQUENCE killed %19, %subreg.sub0, killed %18, %subreg.sub1
%22 = COPY killed %20
%21 = V_CMP_LT_U64_e64 killed %17, %22, implicit %exec
%1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec
@@ -133,7 +133,7 @@ body: |
%24 = S_LSHL_B64 %0, killed %23, implicit-def dead %scc
%25 = S_MOV_B32 61440
%26 = S_MOV_B32 0
- %27 = REG_SEQUENCE killed %26, 1, killed %25, 2
+ %27 = REG_SEQUENCE killed %26, %subreg.sub0, killed %25, %subreg.sub1
%28 = REG_SEQUENCE %6, 17, killed %27, 18
%29 = V_MOV_B32_e32 0, implicit %exec
%30 = COPY %24
@@ -208,7 +208,7 @@ body: |
%9 = S_LOAD_DWORDX2_IMM %3, 13, 0
%6 = COPY %7
%10 = S_MOV_B32 0
- %11 = REG_SEQUENCE %2, 1, killed %10, 2
+ %11 = REG_SEQUENCE %2, %subreg.sub0, killed %10, %subreg.sub1
%0 = COPY %11
%12 = COPY %11.sub0
%13 = COPY %11.sub1
@@ -216,15 +216,15 @@ body: |
%15 = COPY %8.sub1
%16 = S_ADD_U32 %12, killed %14, implicit-def %scc
%17 = S_ADDC_U32 %13, killed %15, implicit-def dead %scc, implicit %scc
- %18 = REG_SEQUENCE killed %16, 1, killed %17, 2
+ %18 = REG_SEQUENCE killed %16, %subreg.sub0, killed %17, %subreg.sub1
%19 = COPY %9.sub0
%20 = COPY %9.sub1
%21 = S_ADD_U32 %12, killed %19, implicit-def %scc
%22 = S_ADDC_U32 %13, killed %20, implicit-def dead %scc, implicit %scc
- %23 = REG_SEQUENCE killed %21, 1, killed %22, 2
+ %23 = REG_SEQUENCE killed %21, %subreg.sub0, killed %22, %subreg.sub1
%24 = S_MOV_B32 0
%25 = S_MOV_B32 1048576
- %26 = REG_SEQUENCE killed %25, 1, killed %24, 2
+ %26 = REG_SEQUENCE killed %25, %subreg.sub0, killed %24, %subreg.sub1
%28 = COPY %26
%27 = V_CMP_LT_U64_e64 killed %18, %28, implicit %exec
%29 = V_CMP_LT_U64_e64 killed %23, %28, implicit %exec
@@ -239,7 +239,7 @@ body: |
%33 = S_LSHL_B64 %0, killed %32, implicit-def dead %scc
%34 = S_MOV_B32 61440
%35 = S_MOV_B32 0
- %36 = REG_SEQUENCE killed %35, 1, killed %34, 2
+ %36 = REG_SEQUENCE killed %35, %subreg.sub0, killed %34, %subreg.sub1
%37 = REG_SEQUENCE %6, 17, killed %36, 18
%38 = V_MOV_B32_e32 0, implicit %exec
%39 = COPY %33
@@ -304,7 +304,7 @@ body: |
%8 = S_LOAD_DWORDX2_IMM %3, 11, 0
%6 = COPY %7
%9 = S_MOV_B32 0
- %10 = REG_SEQUENCE %2, 1, killed %9, 2
+ %10 = REG_SEQUENCE %2, %subreg.sub0, killed %9, %subreg.sub1
%0 = COPY %10
%11 = COPY %10.sub0
%12 = COPY %10.sub1
@@ -312,10 +312,10 @@ body: |
%14 = COPY %8.sub1
%15 = S_ADD_U32 killed %11, killed %13, implicit-def %scc
%16 = S_ADDC_U32 killed %12, killed %14, implicit-def dead %scc, implicit %scc
- %17 = REG_SEQUENCE killed %15, 1, killed %16, 2
+ %17 = REG_SEQUENCE killed %15, %subreg.sub0, killed %16, %subreg.sub1
%18 = S_MOV_B32 12
%19 = S_MOV_B32 1048576
- %20 = REG_SEQUENCE killed %19, 1, killed %18, 2
+ %20 = REG_SEQUENCE killed %19, %subreg.sub0, killed %18, %subreg.sub1
%22 = COPY killed %20.sub1
%21 = V_CMP_LT_U32_e64 killed %17.sub0, %22, implicit %exec
%1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec
@@ -328,7 +328,7 @@ body: |
%24 = S_LSHL_B64 %0, killed %23, implicit-def dead %scc
%25 = S_MOV_B32 61440
%26 = S_MOV_B32 0
- %27 = REG_SEQUENCE killed %26, 1, killed %25, 2
+ %27 = REG_SEQUENCE killed %26, %subreg.sub0, killed %25, %subreg.sub1
%28 = REG_SEQUENCE %6, 17, killed %27, 18
%29 = V_MOV_B32_e32 0, implicit %exec
%30 = COPY %24
diff --git a/test/CodeGen/AMDGPU/private-memory-r600.ll b/test/CodeGen/AMDGPU/private-memory-r600.ll
index 866cd16ec3b..65e72817429 100644
--- a/test/CodeGen/AMDGPU/private-memory-r600.ll
+++ b/test/CodeGen/AMDGPU/private-memory-r600.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
-; RUN: opt -S -mtriple=r600-unknown-unknown -mcpu=redwood -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
+; RUN: opt -S -mtriple=r600-unknown-unknown-amdgiz -mcpu=redwood -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+target datalayout = "A5"
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -18,19 +19,19 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
- %stack = alloca [5 x i32], align 4
+ %stack = alloca [5 x i32], align 4, addrspace(5)
%0 = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
- store i32 4, i32* %arrayidx1, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
+ store i32 4, i32 addrspace(5)* %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
- store i32 5, i32* %arrayidx3, align 4
- %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
- %2 = load i32, i32* %arrayidx10, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
+ store i32 5, i32 addrspace(5)* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
+ %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
store i32 %2, i32 addrspace(1)* %out, align 4
- %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
- %3 = load i32, i32* %arrayidx12
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
+ %3 = load i32, i32 addrspace(5)* %arrayidx12
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
store i32 %3, i32 addrspace(1)* %arrayidx13
ret void
@@ -49,20 +50,20 @@ entry:
define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 {
entry:
- %a = alloca %struct.point
- %b = alloca %struct.point
- %a.x.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 0
- %a.y.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 1
- %b.x.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 0
- %b.y.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 1
- store i32 0, i32* %a.x.ptr
- store i32 1, i32* %a.y.ptr
- store i32 2, i32* %b.x.ptr
- store i32 3, i32* %b.y.ptr
- %a.indirect.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 0
- %b.indirect.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 0
- %a.indirect = load i32, i32* %a.indirect.ptr
- %b.indirect = load i32, i32* %b.indirect.ptr
+ %a = alloca %struct.point, addrspace(5)
+ %b = alloca %struct.point, addrspace(5)
+ %a.x.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 0
+ %a.y.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 1
+ %b.x.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 0
+ %b.y.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 1
+ store i32 0, i32 addrspace(5)* %a.x.ptr
+ store i32 1, i32 addrspace(5)* %a.y.ptr
+ store i32 2, i32 addrspace(5)* %b.x.ptr
+ store i32 3, i32 addrspace(5)* %b.y.ptr
+ %a.indirect.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 0
+ %b.indirect.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 0
+ %a.indirect = load i32, i32 addrspace(5)* %a.indirect.ptr
+ %b.indirect = load i32, i32 addrspace(5)* %b.indirect.ptr
%0 = add i32 %a.indirect, %b.indirect
store i32 %0, i32 addrspace(1)* %out
ret void
@@ -77,32 +78,32 @@ entry:
define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
entry:
- %prv_array_const = alloca [2 x i32]
- %prv_array = alloca [2 x i32]
+ %prv_array_const = alloca [2 x i32], addrspace(5)
+ %prv_array = alloca [2 x i32], addrspace(5)
%a = load i32, i32 addrspace(1)* %in
%b_src_ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
%b = load i32, i32 addrspace(1)* %b_src_ptr
- %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
- store i32 %a, i32* %a_dst_ptr
- %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1
- store i32 %b, i32* %b_dst_ptr
+ %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 0
+ store i32 %a, i32 addrspace(5)* %a_dst_ptr
+ %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 1
+ store i32 %b, i32 addrspace(5)* %b_dst_ptr
br label %for.body
for.body:
%inc = phi i32 [0, %entry], [%count, %for.body]
- %x_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
- %x = load i32, i32* %x_ptr
- %y_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
- %y = load i32, i32* %y_ptr
+ %x_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 0
+ %x = load i32, i32 addrspace(5)* %x_ptr
+ %y_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array, i32 0, i32 0
+ %y = load i32, i32 addrspace(5)* %y_ptr
%xy = add i32 %x, %y
- store i32 %xy, i32* %y_ptr
+ store i32 %xy, i32 addrspace(5)* %y_ptr
%count = add i32 %inc, 1
%done = icmp eq i32 %count, 4095
br i1 %done, label %for.end, label %for.body
for.end:
- %value_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
- %value = load i32, i32* %value_ptr
+ %value_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array, i32 0, i32 0
+ %value = load i32, i32 addrspace(5)* %value_ptr
store i32 %value, i32 addrspace(1)* %out
ret void
}
@@ -112,13 +113,13 @@ for.end:
; R600: MOVA_INT
define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
- %0 = alloca [2 x i16]
- %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
- %2 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 1
- store i16 0, i16* %1
- store i16 1, i16* %2
- %3 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 %index
- %4 = load i16, i16* %3
+ %0 = alloca [2 x i16], addrspace(5)
+ %1 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 0
+ %2 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 1
+ store i16 0, i16 addrspace(5)* %1
+ store i16 1, i16 addrspace(5)* %2
+ %3 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 %index
+ %4 = load i16, i16 addrspace(5)* %3
%5 = sext i16 %4 to i32
store i32 %5, i32 addrspace(1)* %out
ret void
@@ -129,13 +130,13 @@ entry:
; R600: MOVA_INT
define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
- %0 = alloca [2 x i8]
- %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
- %2 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 1
- store i8 0, i8* %1
- store i8 1, i8* %2
- %3 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 %index
- %4 = load i8, i8* %3
+ %0 = alloca [2 x i8], addrspace(5)
+ %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 0
+ %2 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 1
+ store i8 0, i8 addrspace(5)* %1
+ store i8 1, i8 addrspace(5)* %2
+ %3 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 %index
+ %4 = load i8, i8 addrspace(5)* %3
%5 = sext i8 %4 to i32
store i32 %5, i32 addrspace(1)* %out
ret void
@@ -150,13 +151,13 @@ entry:
; R600-NOT: MOV * TO.X
define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 {
entry:
- %0 = alloca [2 x i32]
- %1 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 0
- %2 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 1
- store i32 0, i32* %1
- store i32 1, i32* %2
- %3 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 %in
- %4 = load i32, i32* %3
+ %0 = alloca [2 x i32], addrspace(5)
+ %1 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 0
+ %2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 1
+ store i32 0, i32 addrspace(5)* %1
+ store i32 1, i32 addrspace(5)* %2
+ %3 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 %in
+ %4 = load i32, i32 addrspace(5)* %3
%5 = call i32 @llvm.r600.read.tidig.x()
%6 = add i32 %4, %5
store i32 %6, i32 addrspace(1)* %out
@@ -171,22 +172,22 @@ entry:
; R600-NOT: [[CHAN]]+
define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
entry:
- %0 = alloca [3 x i8], align 1
- %1 = alloca [2 x i8], align 1
- %2 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 0
- %3 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 1
- %4 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 2
- %5 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 0
- %6 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 1
- store i8 0, i8* %2
- store i8 1, i8* %3
- store i8 2, i8* %4
- store i8 1, i8* %5
- store i8 0, i8* %6
- %7 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 %in
- %8 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 %in
- %9 = load i8, i8* %7
- %10 = load i8, i8* %8
+ %0 = alloca [3 x i8], align 1, addrspace(5)
+ %1 = alloca [2 x i8], align 1, addrspace(5)
+ %2 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 0
+ %3 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 1
+ %4 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 2
+ %5 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 0
+ %6 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 1
+ store i8 0, i8 addrspace(5)* %2
+ store i8 1, i8 addrspace(5)* %3
+ store i8 2, i8 addrspace(5)* %4
+ store i8 1, i8 addrspace(5)* %5
+ store i8 0, i8 addrspace(5)* %6
+ %7 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 %in
+ %8 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 %in
+ %9 = load i8, i8 addrspace(5)* %7
+ %10 = load i8, i8 addrspace(5)* %8
%11 = add i8 %9, %10
%12 = sext i8 %11 to i32
store i32 %12, i32 addrspace(1)* %out
@@ -195,13 +196,13 @@ entry:
define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
- %alloca = alloca [2 x [2 x i8]]
- %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
- %gep1 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1
- store i8 0, i8* %gep0
- store i8 1, i8* %gep1
- %gep2 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index
- %load = load i8, i8* %gep2
+ %alloca = alloca [2 x [2 x i8]], addrspace(5)
+ %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
+ store i8 0, i8 addrspace(5)* %gep0
+ store i8 1, i8 addrspace(5)* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
+ %load = load i8, i8 addrspace(5)* %gep2
%sext = sext i8 %load to i32
store i32 %sext, i32 addrspace(1)* %out
ret void
@@ -209,26 +210,26 @@ entry:
define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
- %alloca = alloca [2 x [2 x i32]]
- %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
- %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
- store i32 0, i32* %gep0
- store i32 1, i32* %gep1
- %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
- %load = load i32, i32* %gep2
+ %alloca = alloca [2 x [2 x i32]], addrspace(5)
+ %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
+ store i32 0, i32 addrspace(5)* %gep0
+ store i32 1, i32 addrspace(5)* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
+ %load = load i32, i32 addrspace(5)* %gep2
store i32 %load, i32 addrspace(1)* %out
ret void
}
define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
entry:
- %alloca = alloca [2 x [2 x i64]]
- %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
- %gep1 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1
- store i64 0, i64* %gep0
- store i64 1, i64* %gep1
- %gep2 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index
- %load = load i64, i64* %gep2
+ %alloca = alloca [2 x [2 x i64]], addrspace(5)
+ %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
+ store i64 0, i64 addrspace(5)* %gep0
+ store i64 1, i64 addrspace(5)* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
+ %load = load i64, i64 addrspace(5)* %gep2
store i64 %load, i64 addrspace(1)* %out
ret void
}
@@ -237,40 +238,40 @@ entry:
define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
- %alloca = alloca [2 x [2 x %struct.pair32]]
- %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
- %gep1 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1
- store i32 0, i32* %gep0
- store i32 1, i32* %gep1
- %gep2 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0
- %load = load i32, i32* %gep2
+ %alloca = alloca [2 x [2 x %struct.pair32]], addrspace(5)
+ %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0, i32 1
+ %gep1 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1, i32 1
+ store i32 0, i32 addrspace(5)* %gep0
+ store i32 1, i32 addrspace(5)* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index, i32 0
+ %load = load i32, i32 addrspace(5)* %gep2
store i32 %load, i32 addrspace(1)* %out
ret void
}
define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
- %alloca = alloca [2 x %struct.pair32]
- %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
- %gep1 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0
- store i32 0, i32* %gep0
- store i32 1, i32* %gep1
- %gep2 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0
- %load = load i32, i32* %gep2
+ %alloca = alloca [2 x %struct.pair32], addrspace(5)
+ %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 0, i32 1
+ %gep1 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 1, i32 0
+ store i32 0, i32 addrspace(5)* %gep0
+ store i32 1, i32 addrspace(5)* %gep1
+ %gep2 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 %index, i32 0
+ %load = load i32, i32 addrspace(5)* %gep2
store i32 %load, i32 addrspace(1)* %out
ret void
}
define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
entry:
- %tmp = alloca [2 x i32]
- %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
- %tmp2 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
- store i32 0, i32* %tmp1
- store i32 1, i32* %tmp2
+ %tmp = alloca [2 x i32], addrspace(5)
+ %tmp1 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1
+ store i32 0, i32 addrspace(5)* %tmp1
+ store i32 1, i32 addrspace(5)* %tmp2
%cmp = icmp eq i32 %in, 0
- %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2
- %load = load i32, i32* %sel
+ %sel = select i1 %cmp, i32 addrspace(5)* %tmp1, i32 addrspace(5)* %tmp2
+ %load = load i32, i32 addrspace(5)* %sel
store i32 %load, i32 addrspace(1)* %out
ret void
}
@@ -283,14 +284,14 @@ entry:
; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
- %alloca = alloca [16 x i32]
- %tmp0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
- store i32 5, i32* %tmp0
- %tmp1 = ptrtoint [16 x i32]* %alloca to i32
+ %alloca = alloca [16 x i32], addrspace(5)
+ %tmp0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+ store i32 5, i32 addrspace(5)* %tmp0
+ %tmp1 = ptrtoint [16 x i32] addrspace(5)* %alloca to i32
%tmp2 = add i32 %tmp1, 5
- %tmp3 = inttoptr i32 %tmp2 to i32*
- %tmp4 = getelementptr inbounds i32, i32* %tmp3, i32 %b
- %tmp5 = load i32, i32* %tmp4
+ %tmp3 = inttoptr i32 %tmp2 to i32 addrspace(5)*
+ %tmp4 = getelementptr inbounds i32, i32 addrspace(5)* %tmp3, i32 %b
+ %tmp5 = load i32, i32 addrspace(5)* %tmp4
store i32 %tmp5, i32 addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/AMDGPU/simplify-libcalls.ll b/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 47eb9a9a3d1..aa6c1833bde 100644
--- a/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -1,11 +1,11 @@
-; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-POSTLINK %s
-; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-PRELINK %s
-; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-NATIVE %s
+; RUN: opt -S -O1 -mtriple=amdgcn---amdgiz -amdgpu-simplify-libcall <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-POSTLINK %s
+; RUN: opt -S -O1 -mtriple=amdgcn---amdgiz -amdgpu-simplify-libcall -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-PRELINK %s
+; RUN: opt -S -O1 -mtriple=amdgcn---amdgiz -amdgpu-use-native -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-NATIVE %s
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos
; GCN-POSTLINK: tail call fast float @_Z3sinf(
; GCN-POSTLINK: tail call fast float @_Z3cosf(
-; GCN-PRELINK: call fast float @_Z6sincosfPU3AS4f(
+; GCN-PRELINK: call fast float @_Z6sincosfPf(
; GCN-NATIVE: tail call fast float @_Z10native_sinf(
; GCN-NATIVE: tail call fast float @_Z10native_cosf(
define amdgpu_kernel void @test_sincos(float addrspace(1)* nocapture %a) {
@@ -26,7 +26,7 @@ declare float @_Z3cosf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v2
; GCN-POSTLINK: tail call fast <2 x float> @_Z3sinDv2_f(
; GCN-POSTLINK: tail call fast <2 x float> @_Z3cosDv2_f(
-; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPU3AS4S_(
+; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPS_(
; GCN-NATIVE: tail call fast <2 x float> @_Z10native_sinDv2_f(
; GCN-NATIVE: tail call fast <2 x float> @_Z10native_cosDv2_f(
define amdgpu_kernel void @test_sincos_v2(<2 x float> addrspace(1)* nocapture %a) {
@@ -47,7 +47,7 @@ declare <2 x float> @_Z3cosDv2_f(<2 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v3
; GCN-POSTLINK: tail call fast <3 x float> @_Z3sinDv3_f(
; GCN-POSTLINK: tail call fast <3 x float> @_Z3cosDv3_f(
-; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPU3AS4S_(
+; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPS_(
; GCN-NATIVE: tail call fast <3 x float> @_Z10native_sinDv3_f(
; GCN-NATIVE: tail call fast <3 x float> @_Z10native_cosDv3_f(
define amdgpu_kernel void @test_sincos_v3(<3 x float> addrspace(1)* nocapture %a) {
@@ -73,7 +73,7 @@ declare <3 x float> @_Z3cosDv3_f(<3 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v4
; GCN-POSTLINK: tail call fast <4 x float> @_Z3sinDv4_f(
; GCN-POSTLINK: tail call fast <4 x float> @_Z3cosDv4_f(
-; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPU3AS4S_(
+; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPS_(
; GCN-NATIVE: tail call fast <4 x float> @_Z10native_sinDv4_f(
; GCN-NATIVE: tail call fast <4 x float> @_Z10native_cosDv4_f(
define amdgpu_kernel void @test_sincos_v4(<4 x float> addrspace(1)* nocapture %a) {
@@ -94,7 +94,7 @@ declare <4 x float> @_Z3cosDv4_f(<4 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v8
; GCN-POSTLINK: tail call fast <8 x float> @_Z3sinDv8_f(
; GCN-POSTLINK: tail call fast <8 x float> @_Z3cosDv8_f(
-; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPU3AS4S_(
+; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPS_(
; GCN-NATIVE: tail call fast <8 x float> @_Z10native_sinDv8_f(
; GCN-NATIVE: tail call fast <8 x float> @_Z10native_cosDv8_f(
define amdgpu_kernel void @test_sincos_v8(<8 x float> addrspace(1)* nocapture %a) {
@@ -115,7 +115,7 @@ declare <8 x float> @_Z3cosDv8_f(<8 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v16
; GCN-POSTLINK: tail call fast <16 x float> @_Z3sinDv16_f(
; GCN-POSTLINK: tail call fast <16 x float> @_Z3cosDv16_f(
-; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPU3AS4S_(
+; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPS_(
; GCN-NATIVE: tail call fast <16 x float> @_Z10native_sinDv16_f(
; GCN-NATIVE: tail call fast <16 x float> @_Z10native_cosDv16_f(
define amdgpu_kernel void @test_sincos_v16(<16 x float> addrspace(1)* nocapture %a) {
@@ -685,101 +685,101 @@ define amdgpu_kernel void @test_use_native_sincos(float addrspace(1)* %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
- %tmp1 = addrspacecast float addrspace(1)* %arrayidx1 to float addrspace(4)*
- %call = tail call fast float @_Z6sincosfPU3AS4f(float %tmp, float addrspace(4)* %tmp1)
+ %tmp1 = addrspacecast float addrspace(1)* %arrayidx1 to float*
+ %call = tail call fast float @_Z6sincosfPf(float %tmp, float* %tmp1)
store float %call, float addrspace(1)* %a, align 4
ret void
}
-declare float @_Z6sincosfPU3AS4f(float, float addrspace(4)*)
+declare float @_Z6sincosfPf(float, float*)
%opencl.pipe_t = type opaque
%opencl.reserve_id_t = type opaque
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr)
-; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND:[0-9]+]]
-; GCN-PRELINK: call i32 @__read_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 2, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32* %{{.*}}) #[[NOUNWIND:[0-9]+]]
+; GCN-PRELINK: call i32 @__read_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t addrspace(5)* %{{.*}}, i32 2, i32* %{{.*}}) #[[NOUNWIND]]
define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr {
entry:
%tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
- %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8 addrspace(4)*
- %tmp2 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0
- %tmp3 = tail call %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4)
- %tmp4 = tail call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 2, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0
- tail call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 4, i32 4)
+ %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8*
+ %tmp2 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
+ %tmp3 = tail call %opencl.reserve_id_t addrspace(5)* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4)
+ %tmp4 = tail call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
+ tail call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4)
ret void
}
-declare i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)*, i8 addrspace(4)*, i32, i32)
+declare i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)*, i8*, i32, i32)
-declare %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32)
+declare %opencl.reserve_id_t addrspace(5)* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32)
-declare i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i8 addrspace(4)*, i32, i32)
+declare i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i8*, i32, i32)
-declare void @__commit_read_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i32)
+declare void @__commit_read_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i32)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr)
-; GCN-PRELINK: call i32 @__write_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__write_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 2, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__write_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__write_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t addrspace(5)* %{{.*}}, i32 2, i32* %{{.*}}) #[[NOUNWIND]]
define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr {
entry:
%tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
- %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8 addrspace(4)*
- %tmp2 = tail call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0
- %tmp3 = tail call %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0
- %tmp4 = tail call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 2, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0
- tail call void @__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 4, i32 4) #0
+ %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8*
+ %tmp2 = tail call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
+ %tmp3 = tail call %opencl.reserve_id_t addrspace(5)* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0
+ %tmp4 = tail call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
+ tail call void @__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4) #0
ret void
}
-declare i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)*, i8 addrspace(4)*, i32, i32) local_unnamed_addr
+declare i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)*, i8*, i32, i32) local_unnamed_addr
-declare %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32) local_unnamed_addr
+declare %opencl.reserve_id_t addrspace(5)* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32) local_unnamed_addr
-declare i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i8 addrspace(4)*, i32, i32) local_unnamed_addr
+declare i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i8*, i32, i32) local_unnamed_addr
-declare void @__commit_write_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i32) local_unnamed_addr
+declare void @__commit_write_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i32) local_unnamed_addr
%struct.S = type { [100 x i32] }
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pipe_size
-; GCN-PRELINK: call i32 @__read_pipe_2_1(%opencl.pipe_t addrspace(1)* %{{.*}} i8 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_2(%opencl.pipe_t addrspace(1)* %{{.*}} i16 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}} i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_8(%opencl.pipe_t addrspace(1)* %{{.*}} i64 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_16(%opencl.pipe_t addrspace(1)* %{{.*}}, <2 x i64> addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_32(%opencl.pipe_t addrspace(1)* %{{.*}}, <4 x i64> addrspace(4)* %{{.*}} #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_64(%opencl.pipe_t addrspace(1)* %{{.*}}, <8 x i64> addrspace(4)* %{{.*}} #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_128(%opencl.pipe_t addrspace(1)* %{{.*}}, <16 x i64> addrspace(4)* %{{.*}} #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i8 addrspace(4)* %{{.*}} i32 400, i32 4) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_1(%opencl.pipe_t addrspace(1)* %{{.*}} i8* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_2(%opencl.pipe_t addrspace(1)* %{{.*}} i16* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}} i32* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_8(%opencl.pipe_t addrspace(1)* %{{.*}} i64* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_16(%opencl.pipe_t addrspace(1)* %{{.*}}, <2 x i64>* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_32(%opencl.pipe_t addrspace(1)* %{{.*}}, <4 x i64>* %{{.*}} #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_64(%opencl.pipe_t addrspace(1)* %{{.*}}, <8 x i64>* %{{.*}} #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_128(%opencl.pipe_t addrspace(1)* %{{.*}}, <16 x i64>* %{{.*}} #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i8* %{{.*}} i32 400, i32 4) #[[NOUNWIND]]
define amdgpu_kernel void @test_pipe_size(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(1)* %ptr1, %opencl.pipe_t addrspace(1)* %p2, i16 addrspace(1)* %ptr2, %opencl.pipe_t addrspace(1)* %p4, i32 addrspace(1)* %ptr4, %opencl.pipe_t addrspace(1)* %p8, i64 addrspace(1)* %ptr8, %opencl.pipe_t addrspace(1)* %p16, <2 x i64> addrspace(1)* %ptr16, %opencl.pipe_t addrspace(1)* %p32, <4 x i64> addrspace(1)* %ptr32, %opencl.pipe_t addrspace(1)* %p64, <8 x i64> addrspace(1)* %ptr64, %opencl.pipe_t addrspace(1)* %p128, <16 x i64> addrspace(1)* %ptr128, %opencl.pipe_t addrspace(1)* %pu, %struct.S addrspace(1)* %ptru) local_unnamed_addr #0 {
entry:
- %tmp = addrspacecast i8 addrspace(1)* %ptr1 to i8 addrspace(4)*
- %tmp1 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(4)* %tmp, i32 1, i32 1) #0
+ %tmp = addrspacecast i8 addrspace(1)* %ptr1 to i8*
+ %tmp1 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8* %tmp, i32 1, i32 1) #0
%tmp2 = bitcast i16 addrspace(1)* %ptr2 to i8 addrspace(1)*
- %tmp3 = addrspacecast i8 addrspace(1)* %tmp2 to i8 addrspace(4)*
- %tmp4 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8 addrspace(4)* %tmp3, i32 2, i32 2) #0
+ %tmp3 = addrspacecast i8 addrspace(1)* %tmp2 to i8*
+ %tmp4 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8* %tmp3, i32 2, i32 2) #0
%tmp5 = bitcast i32 addrspace(1)* %ptr4 to i8 addrspace(1)*
- %tmp6 = addrspacecast i8 addrspace(1)* %tmp5 to i8 addrspace(4)*
- %tmp7 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8 addrspace(4)* %tmp6, i32 4, i32 4) #0
+ %tmp6 = addrspacecast i8 addrspace(1)* %tmp5 to i8*
+ %tmp7 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8* %tmp6, i32 4, i32 4) #0
%tmp8 = bitcast i64 addrspace(1)* %ptr8 to i8 addrspace(1)*
- %tmp9 = addrspacecast i8 addrspace(1)* %tmp8 to i8 addrspace(4)*
- %tmp10 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8 addrspace(4)* %tmp9, i32 8, i32 8) #0
+ %tmp9 = addrspacecast i8 addrspace(1)* %tmp8 to i8*
+ %tmp10 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8* %tmp9, i32 8, i32 8) #0
%tmp11 = bitcast <2 x i64> addrspace(1)* %ptr16 to i8 addrspace(1)*
- %tmp12 = addrspacecast i8 addrspace(1)* %tmp11 to i8 addrspace(4)*
- %tmp13 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8 addrspace(4)* %tmp12, i32 16, i32 16) #0
+ %tmp12 = addrspacecast i8 addrspace(1)* %tmp11 to i8*
+ %tmp13 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8* %tmp12, i32 16, i32 16) #0
%tmp14 = bitcast <4 x i64> addrspace(1)* %ptr32 to i8 addrspace(1)*
- %tmp15 = addrspacecast i8 addrspace(1)* %tmp14 to i8 addrspace(4)*
- %tmp16 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8 addrspace(4)* %tmp15, i32 32, i32 32) #0
+ %tmp15 = addrspacecast i8 addrspace(1)* %tmp14 to i8*
+ %tmp16 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8* %tmp15, i32 32, i32 32) #0
%tmp17 = bitcast <8 x i64> addrspace(1)* %ptr64 to i8 addrspace(1)*
- %tmp18 = addrspacecast i8 addrspace(1)* %tmp17 to i8 addrspace(4)*
- %tmp19 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p64, i8 addrspace(4)* %tmp18, i32 64, i32 64) #0
+ %tmp18 = addrspacecast i8 addrspace(1)* %tmp17 to i8*
+ %tmp19 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p64, i8* %tmp18, i32 64, i32 64) #0
%tmp20 = bitcast <16 x i64> addrspace(1)* %ptr128 to i8 addrspace(1)*
- %tmp21 = addrspacecast i8 addrspace(1)* %tmp20 to i8 addrspace(4)*
- %tmp22 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8 addrspace(4)* %tmp21, i32 128, i32 128) #0
+ %tmp21 = addrspacecast i8 addrspace(1)* %tmp20 to i8*
+ %tmp22 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8* %tmp21, i32 128, i32 128) #0
%tmp23 = bitcast %struct.S addrspace(1)* %ptru to i8 addrspace(1)*
- %tmp24 = addrspacecast i8 addrspace(1)* %tmp23 to i8 addrspace(4)*
- %tmp25 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8 addrspace(4)* %tmp24, i32 400, i32 4) #0
+ %tmp24 = addrspacecast i8 addrspace(1)* %tmp23 to i8*
+ %tmp25 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8* %tmp24, i32 400, i32 4) #0
ret void
}
diff --git a/test/CodeGen/AMDGPU/unknown-processor.ll b/test/CodeGen/AMDGPU/unknown-processor.ll
index e25f2235993..6dfcff77d81 100644
--- a/test/CodeGen/AMDGPU/unknown-processor.ll
+++ b/test/CodeGen/AMDGPU/unknown-processor.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s
-; RUN: llc -march=r600 -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s
+; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s
+target datalayout = "A5"
; Should not crash when the processor is not recognized and the
; wavefront size feature not set.
@@ -14,7 +15,7 @@
; R600: MOV
define amdgpu_kernel void @foo() {
- %alloca = alloca i32, align 4
- store volatile i32 0, i32* %alloca
+ %alloca = alloca i32, align 4, addrspace(5)
+ store volatile i32 0, i32 addrspace(5)* %alloca
ret void
}
diff --git a/test/CodeGen/AMDGPU/unsupported-calls.ll b/test/CodeGen/AMDGPU/unsupported-calls.ll
index 990b25e0c59..68872c54f7f 100644
--- a/test/CodeGen/AMDGPU/unsupported-calls.ll
+++ b/test/CodeGen/AMDGPU/unsupported-calls.ll
@@ -1,5 +1,5 @@
-; RUN: not llc -march=amdgcn -tailcallopt < %s 2>&1 | FileCheck -check-prefix=GCN %s
-; RUN: not llc -march=r600 -mcpu=cypress -tailcallopt < %s 2>&1 | FileCheck -check-prefix=R600 %s
+; RUN: not llc -march=amdgcn -mtriple=amdgcn---amdgiz -tailcallopt < %s 2>&1 | FileCheck -check-prefix=GCN %s
+; RUN: not llc -march=r600 -mtriple=r600---amdgiz -mcpu=cypress -tailcallopt < %s 2>&1 | FileCheck -check-prefix=R600 %s
declare i32 @external_function(i32) nounwind
diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir
index d96463f00c7..939c851584c 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir
@@ -1,6 +1,7 @@
# RUN: llc -O0 -mtriple arm-- -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
--- |
define void @test_mla() #0 { ret void }
+ define void @test_mla_commutative() #0 { ret void }
define void @test_mla_v5() #1 { ret void }
define void @test_mls() #2 { ret void }
@@ -45,6 +46,40 @@ body: |
; CHECK: BX_RET 14, _, implicit %r0
...
---
+name: test_mla_commutative
+# CHECK-LABEL: name: test_mla_commutative
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: gprb }
+ - { id: 1, class: gprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+ - { id: 4, class: gprb }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ ; CHECK: [[VREGX:%[0-9]+]]:gprnopc = COPY %r0
+ ; CHECK: [[VREGY:%[0-9]+]]:gprnopc = COPY %r1
+ ; CHECK: [[VREGZ:%[0-9]+]]:gprnopc = COPY %r2
+
+ %3(s32) = G_MUL %0, %1
+ %4(s32) = G_ADD %2, %3
+ ; CHECK: [[VREGR:%[0-9]+]]:gprnopc = MLA [[VREGX]], [[VREGY]], [[VREGZ]], 14, _, _
+
+ %r0 = COPY %4(s32)
+ ; CHECK: %r0 = COPY [[VREGR]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
name: test_mla_v5
# CHECK-LABEL: name: test_mla_v5
legalized: true
diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
index 0fdd485ba90..588ceaca2c4 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
@@ -970,9 +970,10 @@ registers:
- { id: 1, class: gprb }
- { id: 2, class: gprb }
- { id: 3, class: gprb }
+ - { id: 4, class: gprb }
body: |
bb.0:
- liveins: %r0, %r1
+ liveins: %r0, %r1, %r2
%0(p0) = COPY %r0
; CHECK: [[VREGX:%[0-9]+]]:gpr = COPY %r0
@@ -980,14 +981,17 @@ body: |
%1(p0) = COPY %r1
; CHECK: [[VREGY:%[0-9]+]]:gpr = COPY %r1
- %2(s1) = G_TRUNC %1(p0)
- ; CHECK: [[VREGC:%[0-9]+]]:gpr = COPY [[VREGY]]
+ %2(s32) = COPY %r2
+ ; CHECK: [[VREGC:%[0-9]+]]:gpr = COPY %r2
- %3(p0) = G_SELECT %2(s1), %0, %1
- ; CHECK: CMPri [[VREGC]], 0, 14, _, implicit-def %cpsr
+ %3(s1) = G_TRUNC %2(s32)
+ ; CHECK: [[VREGD:%[0-9]+]]:gpr = COPY [[VREGC]]
+
+ %4(p0) = G_SELECT %3(s1), %0, %1
+ ; CHECK: CMPri [[VREGD]], 0, 14, _, implicit-def %cpsr
; CHECK: [[RES:%[0-9]+]]:gpr = MOVCCr [[VREGX]], [[VREGY]], 0, %cpsr
- %r0 = COPY %3(p0)
+ %r0 = COPY %4(p0)
; CHECK: %r0 = COPY [[RES]]
BX_RET 14, _, implicit %r0
diff --git a/test/CodeGen/Generic/llc-start-stop.ll b/test/CodeGen/Generic/llc-start-stop.ll
index 85b69c37aa0..9056e2cab49 100644
--- a/test/CodeGen/Generic/llc-start-stop.ll
+++ b/test/CodeGen/Generic/llc-start-stop.ll
@@ -13,15 +13,15 @@
; STOP-BEFORE-NOT: Loop Strength Reduction
; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-AFTER
-; START-AFTER: -machine-branch-prob -gc-lowering
+; START-AFTER: -machine-branch-prob -expandmemcmp
; START-AFTER: FunctionPass Manager
-; START-AFTER-NEXT: Lower Garbage Collection Instructions
+; START-AFTER-NEXT: Expand memcmp() to load/stores
; RUN: llc < %s -debug-pass=Structure -start-before=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-BEFORE
; START-BEFORE: -machine-branch-prob -domtree
; START-BEFORE: FunctionPass Manager
; START-BEFORE: Loop Strength Reduction
-; START-BEFORE-NEXT: Lower Garbage Collection Instructions
+; START-BEFORE-NEXT: Expand memcmp() to load/stores
; RUN: not llc < %s -start-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-START-BEFORE
; RUN: not llc < %s -stop-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-STOP-BEFORE
diff --git a/test/CodeGen/Hexagon/isel-prefer.ll b/test/CodeGen/Hexagon/isel-prefer.ll
index 062b0b3a0ea..7094544f54b 100644
--- a/test/CodeGen/Hexagon/isel-prefer.ll
+++ b/test/CodeGen/Hexagon/isel-prefer.ll
@@ -54,4 +54,14 @@ b2:
ret i32 %v6
}
+; CHECK-LABEL: Prefer_L2_loadrub_io:
+; CHECK: memub(r0+#65)
+define i64 @Prefer_L2_loadrub_io(i8* %a0) #0 {
+b1:
+ %v2 = getelementptr i8, i8* %a0, i32 65
+ %v3 = load i8, i8* %v2
+ %v4 = zext i8 %v3 to i64
+ ret i64 %v4
+}
+
attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/MIR/X86/subregister-index-operands.mir b/test/CodeGen/MIR/X86/subregister-index-operands.mir
index e3c5b9d17ee..4d8b24608b7 100644
--- a/test/CodeGen/MIR/X86/subregister-index-operands.mir
+++ b/test/CodeGen/MIR/X86/subregister-index-operands.mir
@@ -22,9 +22,9 @@ body: |
liveins: %edi, %eax
; CHECK-LABEL: name: t
; CHECK: liveins: %edi, %eax
- ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:gr32 = INSERT_SUBREG %edi, %al, 1
- ; CHECK: [[EXTRACT_SUBREG:%[0-9]+]]:gr8 = EXTRACT_SUBREG %eax, 2
- ; CHECK: %ax = REG_SEQUENCE [[EXTRACT_SUBREG]], 1, [[EXTRACT_SUBREG]], 2
+ ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:gr32 = INSERT_SUBREG %edi, %al, %subreg.sub_8bit
+ ; CHECK: [[EXTRACT_SUBREG:%[0-9]+]]:gr8 = EXTRACT_SUBREG %eax, %subreg.sub_8bit_hi
+ ; CHECK: %ax = REG_SEQUENCE [[EXTRACT_SUBREG]], %subreg.sub_8bit, [[EXTRACT_SUBREG]], %subreg.sub_8bit_hi
; CHECK: RETQ %ax
%0 = INSERT_SUBREG %edi, %al, %subreg.sub_8bit
%1 = EXTRACT_SUBREG %eax, %subreg.sub_8bit_hi
diff --git a/test/CodeGen/Mips/brind-tailcall.ll b/test/CodeGen/Mips/brind-tailcall.ll
new file mode 100644
index 00000000000..78fb0f15107
--- /dev/null
+++ b/test/CodeGen/Mips/brind-tailcall.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=pic < %s 2>&1 | FileCheck --check-prefix=PIC %s
+; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=static < %s 2>&1 | FileCheck --check-prefix=STATIC %s
+; RUN: llc -march=mips64 -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=pic < %s 2>&1 | FileCheck --check-prefix=PIC64 %s
+; RUN: llc -march=mips64 -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=static < %s 2>&1 | FileCheck --check-prefix=STATIC64 %s
+; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=pic -mattr=+micromips < %s 2>&1 | FileCheck --check-prefix=PIC %s
+; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=static -mattr=+micromips < %s 2>&1 | FileCheck --check-prefix=STATIC-MM %s
+; RUN: llc -march=mips -mcpu=mips32r6 -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=pic -mattr=+micromips < %s 2>&1 | FileCheck --check-prefix=PIC %s
+; RUN: llc -march=mips -mcpu=mips32r6 -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=static -mattr=+micromips < %s 2>&1 | FileCheck --check-prefix=STATIC-MM %s
+; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=pic -mattr=+mips16 < %s 2>&1 | FileCheck --check-prefix=MIPS16 %s
+; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=static -mattr=+mips16 < %s 2>&1 | FileCheck --check-prefix=MIPS16 %s
+
+; REQUIRES: asserts
+
+; Test that the correct pseudo instructions are generated for indirect
+; branches and tail calls. Previously, the order of the DAG matcher table
+; determined if the correct instruction was selected for mips16.
+
+declare protected void @a()
+
+define void @test1(i32 %a) {
+entry:
+ %0 = trunc i32 %a to i1
+ %1 = select i1 %0,
+ i8* blockaddress(@test1, %bb),
+ i8* blockaddress(@test1, %bb6)
+ indirectbr i8* %1, [label %bb, label %bb6]
+
+; STATIC: PseudoIndirectBranch
+; STATIC-MM: PseudoIndirectBranch
+; STATIC-NOT: PseudoIndirectBranch64
+; STATIC64: PseudoIndirectBranch64
+; PIC: PseudoIndirectBranch
+; PIC-NOT: PseudoIndirectBranch64
+; PIC64: PseudoIndirectBranch64
+; MIPS16: JrcRx16
+bb:
+ ret void
+
+bb6:
+ tail call void @a()
+
+; STATIC: TAILCALL
+; STATIC-NOT: TAILCALL_MM
+; STATIC-MM: TAILCALL_MM
+; PIC: TAILCALLREG
+; PIC-NOT: TAILCALLREG64
+; PIC64: TAILCALLREG64
+; MIPS16: RetRA16
+ ret void
+}
diff --git a/test/CodeGen/Mips/dins.ll b/test/CodeGen/Mips/dins.ll
index 8a8b377861a..2f7138ca4c5 100644
--- a/test/CodeGen/Mips/dins.ll
+++ b/test/CodeGen/Mips/dins.ll
@@ -1,7 +1,11 @@
-; RUN: llc -O2 -march=mips64 -mcpu=mips64r2 -target-abi=n64 < %s -o - | FileCheck %s -check-prefix=MIPS64R2
-; RUN: llc -O2 -march=mips -mcpu=mips32r2 < %s -o - | FileCheck %s -check-prefix=MIPS32R2
-; RUN: llc -O2 -march=mips -mattr=mips16 < %s -o - | FileCheck %s -check-prefix=MIPS16
-; RUN: llc -O2 -march=mips64 -mcpu=mips64r2 -target-abi=n32 < %s -o - | FileCheck %s -check-prefix=MIPS64R2N32
+; RUN: llc -O2 -verify-machineinstrs -march=mips64 -mcpu=mips64r2 \
+; RUN: -target-abi=n64 < %s -o - | FileCheck %s -check-prefix=MIPS64R2
+; RUN: llc -O2 -verify-machineinstrs -march=mips -mcpu=mips32r2 < %s -o - \
+; RUN: | FileCheck %s -check-prefix=MIPS32R2
+; RUN: llc -O2 -verify-machineinstrs -march=mips -mattr=mips16 < %s -o - \
+; RUN: | FileCheck %s -check-prefix=MIPS16
+; RUN: llc -O2 -verify-machineinstrs -march=mips64 -mcpu=mips64r2 \
+; RUN: -target-abi=n32 < %s -o - | FileCheck %s -check-prefix=MIPS64R2N32
; #include <stdint.h>
; #include <stdio.h>
@@ -60,7 +64,7 @@ entry:
; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 123
; MIPS64R2: dinsm $[[R0:[0-9]+]], $[[R1:[0-9]+]], 27, 37
; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 4
-; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 28, 6
+; MIPS64R2: dinsm $[[R0:[0-9]+]], $[[R1:[0-9]+]], 28, 6
; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 5
; MIPS64R2: dinsu $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50, 14
; MIPS64R2: dsrl $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50
diff --git a/test/CodeGen/Mips/msa/emergency-spill.mir b/test/CodeGen/Mips/msa/emergency-spill.mir
new file mode 100644
index 00000000000..502b60f673e
--- /dev/null
+++ b/test/CodeGen/Mips/msa/emergency-spill.mir
@@ -0,0 +1,221 @@
+# RUN: llc %s -start-after=shrink-wrap -march=mips64 -mcpu=mips64r6 -mattr=+fp64,+msa -o /dev/null
+
+# Test that estimated size of the stack leads to the creation of an emergency
+# spill when MSA is in use. Previously, this test case would fail during
+# register scavenging due to the lack of a spill slot.
+--- |
+ define inreg { i64, i64 } @test(i64 inreg %a.coerce0, i64 inreg %a.coerce1, i64 inreg %b.coerce0, i64 inreg %b.coerce1, i32 signext %c) #0 {
+ entry:
+ %retval = alloca <16 x i8>, align 16
+ %a = alloca <16 x i8>, align 16
+ %b = alloca <16 x i8>, align 16
+ %a.addr = alloca <16 x i8>, align 16
+ %b.addr = alloca <16 x i8>, align 16
+ %c.addr = alloca i32, align 4
+ %g = alloca <16 x i8>*, align 8
+ %d = alloca i8*, align 8
+ %0 = bitcast <16 x i8>* %a to { i64, i64 }*
+ %1 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i32 0, i32 0
+ store i64 %a.coerce0, i64* %1, align 16
+ %2 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i32 0, i32 1
+ store i64 %a.coerce1, i64* %2, align 8
+ %a1 = load <16 x i8>, <16 x i8>* %a, align 16
+ %3 = bitcast <16 x i8>* %b to { i64, i64 }*
+ %4 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %3, i32 0, i32 0
+ store i64 %b.coerce0, i64* %4, align 16
+ %5 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %3, i32 0, i32 1
+ store i64 %b.coerce1, i64* %5, align 8
+ %b2 = load <16 x i8>, <16 x i8>* %b, align 16
+ store <16 x i8> %a1, <16 x i8>* %a.addr, align 16
+ store <16 x i8> %b2, <16 x i8>* %b.addr, align 16
+ store i32 %c, i32* %c.addr, align 4
+ %6 = alloca i8, i64 6400, align 16
+ %7 = bitcast i8* %6 to <16 x i8>*
+ store <16 x i8>* %7, <16 x i8>** %g, align 8
+ %8 = load <16 x i8>*, <16 x i8>** %g, align 8
+ call void @h(<16 x i8>* %b.addr, <16 x i8>* %8)
+ %9 = load <16 x i8>*, <16 x i8>** %g, align 8
+ %10 = bitcast <16 x i8>* %9 to i8*
+ store i8* %10, i8** %d, align 8
+ %11 = load <16 x i8>, <16 x i8>* %a.addr, align 16
+ %12 = load i8*, i8** %d, align 8
+ %arrayidx = getelementptr inbounds i8, i8* %12, i64 0
+ %13 = load i8, i8* %arrayidx, align 1
+ %conv = sext i8 %13 to i32
+ %14 = call <16 x i8> @llvm.mips.fill.b(i32 %conv)
+ %add = add <16 x i8> %11, %14
+ %15 = load i8*, i8** %d, align 8
+ %arrayidx3 = getelementptr inbounds i8, i8* %15, i64 1
+ %16 = load i8, i8* %arrayidx3, align 1
+ %conv4 = sext i8 %16 to i32
+ %17 = call <16 x i8> @llvm.mips.fill.b(i32 %conv4)
+ %add5 = add <16 x i8> %add, %17
+ %18 = load <16 x i8>, <16 x i8>* %b.addr, align 16
+ %add6 = add <16 x i8> %18, %add5
+ store <16 x i8> %add6, <16 x i8>* %b.addr, align 16
+ %19 = load <16 x i8>, <16 x i8>* %b.addr, align 16
+ store <16 x i8> %19, <16 x i8>* %retval, align 16
+ %20 = bitcast <16 x i8>* %retval to { i64, i64 }*
+ %21 = load { i64, i64 }, { i64, i64 }* %20, align 16
+ ret { i64, i64 } %21
+ }
+
+ declare void @h(<16 x i8>*, <16 x i8>*)
+
+ declare <16 x i8> @llvm.mips.fill.b(i32)
+
+ declare void @llvm.stackprotector(i8*, i8**)
+
+...
+---
+name: test
+alignment: 3
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+ - { reg: '%a0_64', virtual-reg: '' }
+ - { reg: '%a1_64', virtual-reg: '' }
+ - { reg: '%a2_64', virtual-reg: '' }
+ - { reg: '%a3_64', virtual-reg: '' }
+ - { reg: '%t0_64', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 16
+ adjustsStack: false
+ hasCalls: true
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+ - { id: 0, name: retval, type: default, offset: 0, size: 16, alignment: 16,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 1, name: a, type: default, offset: 0, size: 16, alignment: 16,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 2, name: b, type: default, offset: 0, size: 16, alignment: 16,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 3, name: a.addr, type: default, offset: 0, size: 16, alignment: 16,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 4, name: b.addr, type: default, offset: 0, size: 16, alignment: 16,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 5, name: c.addr, type: default, offset: 0, size: 4, alignment: 4,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 6, name: g, type: default, offset: 0, size: 8, alignment: 8,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 7, name: d, type: default, offset: 0, size: 8, alignment: 8,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 8, name: '', type: default, offset: 0, size: 6400,
+ alignment: 16, stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+constants:
+body: |
+ bb.0.entry:
+ liveins: %a0_64, %a1_64, %a2_64, %a3_64, %t0_64
+
+ SD killed %a0_64, %stack.1.a, 0 :: (store 8 into %ir.1, align 16)
+ SD killed %a1_64, %stack.1.a, 8 :: (store 8 into %ir.2)
+ %w0 = LD_B %stack.1.a, 0 :: (dereferenceable load 16 from %ir.a)
+ SD killed %a2_64, %stack.2.b, 0 :: (store 8 into %ir.4, align 16)
+ SD killed %a3_64, %stack.2.b, 8 :: (store 8 into %ir.5)
+ %w1 = LD_B %stack.2.b, 0 :: (dereferenceable load 16 from %ir.b)
+ ST_B killed %w0, %stack.3.a.addr, 0 :: (store 16 into %ir.a.addr)
+ ST_B killed %w1, %stack.4.b.addr, 0 :: (store 16 into %ir.b.addr)
+ SW %t0, %stack.5.c.addr, 0, implicit killed %t0_64 :: (store 4 into %ir.c.addr)
+ %at_64 = LEA_ADDiu64 %stack.8, 0
+ SD killed %at_64, %stack.6.g, 0 :: (store 8 into %ir.g)
+ %a1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp
+ %a0_64 = LEA_ADDiu64 %stack.4.b.addr, 0
+ JAL @h, csr_n64, implicit-def dead %ra, implicit %a0_64, implicit %a1_64, implicit-def %sp
+ ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp
+ %at_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %v0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %v1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %a0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %a1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %a2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %a3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t4_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t5_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t6_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t7_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %s0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %s1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %s2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %s3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %s4_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %s5_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %s6_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %s7_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t8_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t9_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %ra_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %w0 = LD_B %stack.3.a.addr, 0 :: (dereferenceable load 16 from %ir.a.addr)
+ SD %at_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %v0_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %v1_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %a0_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %a1_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %a2_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %a3_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t0_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t1_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t2_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t3_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t4_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t5_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t6_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t7_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %s0_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %s1_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %s2_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %s3_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %s4_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %s5_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %s6_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %s7_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t8_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t9_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %ra_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ %at_64 = LD %stack.7.d, 0 :: (dereferenceable load 8 from %ir.d)
+ %v0 = LB %at_64, 0 :: (load 1 from %ir.arrayidx)
+ %w1 = FILL_B killed %v0
+ %w0 = ADDV_B killed %w0, killed %w1
+ %at = LB killed %at_64, 1 :: (load 1 from %ir.arrayidx3)
+ %w1 = FILL_B killed %at
+ %w0 = ADDV_B killed %w0, killed %w1
+ %w1 = LD_B %stack.4.b.addr, 0 :: (dereferenceable load 16 from %ir.b.addr)
+ %w0 = ADDV_B killed %w1, killed %w0
+ ST_B killed %w0, %stack.4.b.addr, 0 :: (store 16 into %ir.b.addr)
+ %w0 = LD_B %stack.4.b.addr, 0 :: (dereferenceable load 16 from %ir.b.addr)
+ ST_B killed %w0, %stack.0.retval, 0 :: (store 16 into %ir.retval)
+ %v0_64 = LD %stack.0.retval, 0 :: (dereferenceable load 8 from %ir.20, align 16)
+ %v1_64 = LD %stack.0.retval, 8 :: (dereferenceable load 8 from %ir.20 + 8, align 16)
+ RetRA implicit %v0_64, implicit %v1_64
+
+...
diff --git a/test/CodeGen/Mips/msa/frameindex.ll b/test/CodeGen/Mips/msa/frameindex.ll
index f903381f9ef..9c2228d3bf6 100644
--- a/test/CodeGen/Mips/msa/frameindex.ll
+++ b/test/CodeGen/Mips/msa/frameindex.ll
@@ -18,7 +18,8 @@ define void @loadstore_v16i8_just_under_simm10() nounwind {
; MIPS32-AE: loadstore_v16i8_just_under_simm10:
%1 = alloca <16 x i8>
- %2 = alloca [496 x i8] ; Push the frame right up to 512 bytes
+ %2 = alloca [492 x i8] ; Push the frame--acounting for the emergency spill
+ ; slot--right up to 512 bytes
%3 = load volatile <16 x i8>, <16 x i8>* %1
; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 496($sp)
@@ -33,7 +34,8 @@ define void @loadstore_v16i8_just_over_simm10() nounwind {
; MIPS32-AE: loadstore_v16i8_just_over_simm10:
%1 = alloca <16 x i8>
- %2 = alloca [497 x i8] ; Push the frame just over 512 bytes
+ %2 = alloca [497 x i8] ; Push the frame--acounting for the emergency spill
+ ; slot--right up to 512 bytes
%3 = load volatile <16 x i8>, <16 x i8>* %1
; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 512
@@ -50,7 +52,8 @@ define void @loadstore_v16i8_just_under_simm16() nounwind {
; MIPS32-AE: loadstore_v16i8_just_under_simm16:
%1 = alloca <16 x i8>
- %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+ %2 = alloca [32752 x i8] ; Push the frame--acounting for the emergency spill
+ ; slot--right up to 32768 bytes
%3 = load volatile <16 x i8>, <16 x i8>* %1
; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -69,7 +72,8 @@ define void @loadstore_v16i8_just_over_simm16() nounwind {
; MIPS32-AE: loadstore_v16i8_just_over_simm16:
%1 = alloca <16 x i8>
- %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+ %2 = alloca [32753 x i8] ; Push the frame--acounting for the emergency spill
+ ; slot--just over 32768 bytes
%3 = load volatile <16 x i8>, <16 x i8>* %1
; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -121,7 +125,8 @@ define void @loadstore_v8i16_just_under_simm10() nounwind {
; MIPS32-AE: loadstore_v8i16_just_under_simm10:
%1 = alloca <8 x i16>
- %2 = alloca [1008 x i8] ; Push the frame right up to 1024 bytes
+ %2 = alloca [1004 x i8] ; Push the frame--acounting for the emergency spill
+ ; slot--right up to 1024 bytes
%3 = load volatile <8 x i16>, <8 x i16>* %1
; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 1008($sp)
@@ -136,7 +141,8 @@ define void @loadstore_v8i16_just_over_simm10() nounwind {
; MIPS32-AE: loadstore_v8i16_just_over_simm10:
%1 = alloca <8 x i16>
- %2 = alloca [1009 x i8] ; Push the frame just over 1024 bytes
+ %2 = alloca [1009 x i8] ; Push the frame--acounting for the emergency spill
+ ; slot--just over 1024 bytes
%3 = load volatile <8 x i16>, <8 x i16>* %1
; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1024
@@ -153,7 +159,8 @@ define void @loadstore_v8i16_just_under_simm16() nounwind {
; MIPS32-AE: loadstore_v8i16_just_under_simm16:
%1 = alloca <8 x i16>
- %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+ %2 = alloca [32752 x i8] ; Push the frame--acounting for the emergency spill
+ ; slot--right up to 32768 bytes
%3 = load volatile <8 x i16>, <8 x i16>* %1
; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -172,7 +179,8 @@ define void @loadstore_v8i16_just_over_simm16() nounwind {
; MIPS32-AE: loadstore_v8i16_just_over_simm16:
%1 = alloca <8 x i16>
- %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+ %2 = alloca [32753 x i8] ; Push the frame--acounting for the emergency spill
+ ; slot--just over 32768 bytes
%3 = load volatile <8 x i16>, <8 x i16>* %1
; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -224,7 +232,8 @@ define void @loadstore_v4i32_just_under_simm10() nounwind {
; MIPS32-AE: loadstore_v4i32_just_under_simm10:
%1 = alloca <4 x i32>
- %2 = alloca [2032 x i8] ; Push the frame right up to 2048 bytes
+ %2 = alloca [2028 x i8] ; Push the frame--acounting for the emergency spill
+ ; slot--right up to 2048 bytes
%3 = load volatile <4 x i32>, <4 x i32>* %1
; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 2032($sp)
@@ -239,7 +248,8 @@ define void @loadstore_v4i32_just_over_simm10() nounwind {
; MIPS32-AE: loadstore_v4i32_just_over_simm10:
%1 = alloca <4 x i32>
- %2 = alloca [2033 x i8] ; Push the frame just over 2048 bytes
+ %2 = alloca [2033 x i8] ; Push the frame--acounting for the emergency spill
+ ; slot--just over 2048 bytes
%3 = load volatile <4 x i32>, <4 x i32>* %1
; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 2048
@@ -256,7 +266,8 @@ define void @loadstore_v4i32_just_under_simm16() nounwind {
; MIPS32-AE: loadstore_v4i32_just_under_simm16:
%1 = alloca <4 x i32>
- %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+ %2 = alloca [32752 x i8] ; Push the frame--acounting for the emergency spill
+ ; slot-- right up to 32768 bytes
%3 = load volatile <4 x i32>, <4 x i32>* %1
; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -275,7 +286,8 @@ define void @loadstore_v4i32_just_over_simm16() nounwind {
; MIPS32-AE: loadstore_v4i32_just_over_simm16:
%1 = alloca <4 x i32>
- %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+ %2 = alloca [32753 x i8] ; Push the frame--acounting for the emergency spill
+ ; slot--just over 32768 bytes
%3 = load volatile <4 x i32>, <4 x i32>* %1
; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -327,8 +339,8 @@ define void @loadstore_v2i64_just_under_simm10() nounwind {
; MIPS32-AE: loadstore_v2i64_just_under_simm10:
%1 = alloca <2 x i64>
- %2 = alloca [4080 x i8] ; Push the frame right up to 4096 bytes
-
+ %2 = alloca [4076 x i8] ; Push the frame--acounting for the emergency spill
+ ; slot--right up to 4096 bytes
%3 = load volatile <2 x i64>, <2 x i64>* %1
; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 4080($sp)
store volatile <2 x i64> %3, <2 x i64>* %1
@@ -342,7 +354,8 @@ define void @loadstore_v2i64_just_over_simm10() nounwind {
; MIPS32-AE: loadstore_v2i64_just_over_simm10:
%1 = alloca <2 x i64>
- %2 = alloca [4081 x i8] ; Push the frame just over 4096 bytes
+ %2 = alloca [4081 x i8] ; Push the frame--acounting for the emergency spill
+ ; slot--just over 4096 bytes
%3 = load volatile <2 x i64>, <2 x i64>* %1
; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 4096
@@ -359,7 +372,8 @@ define void @loadstore_v2i64_just_under_simm16() nounwind {
; MIPS32-AE: loadstore_v2i64_just_under_simm16:
%1 = alloca <2 x i64>
- %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+ %2 = alloca [32752 x i8] ; Push the frame--acounting for the emergency spill
+ ; slot--right up to 32768 bytes
%3 = load volatile <2 x i64>, <2 x i64>* %1
; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -378,7 +392,8 @@ define void @loadstore_v2i64_just_over_simm16() nounwind {
; MIPS32-AE: loadstore_v2i64_just_over_simm16:
%1 = alloca <2 x i64>
- %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+ %2 = alloca [32753 x i8] ; Push the frame--acounting for the emergency spill
+ ; slot--just over 32768 bytes
%3 = load volatile <2 x i64>, <2 x i64>* %1
; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
diff --git a/test/CodeGen/Mips/tailcall/tailcall.ll b/test/CodeGen/Mips/tailcall/tailcall.ll
index 3f04e1cf305..1c81335937d 100644
--- a/test/CodeGen/Mips/tailcall/tailcall.ll
+++ b/test/CodeGen/Mips/tailcall/tailcall.ll
@@ -27,7 +27,7 @@
; RUN: llc -march=mipsel -relocation-model=pic -mcpu=mips32r6 -mattr=+micromips \
; RUN: -mips-tail-calls=1 < %s | FileCheck %s -check-prefixes=ALL,PIC32MM
; RUN: llc -march=mipsel -relocation-model=static -mcpu=mips32r6 \
-; RUN: -mattr=+micromips -mips-tail-calls=1 < %s | FileCheck %s -check-prefixes=ALL,STATIC32
+; RUN: -mattr=+micromips -mips-tail-calls=1 < %s | FileCheck %s -check-prefixes=ALL,STATIC32MMR6
; RUN: llc -march=mips64el -relocation-model=pic -mcpu=mips64r6 \
; RUN: -mattr=+micromips -mips-tail-calls=1 < %s | FileCheck %s -check-prefix=PIC64R6MM
; RUN: llc -march=mips64el -relocation-model=static -mcpu=mips64r6 \
@@ -51,6 +51,7 @@ entry:
; PIC32MM: jalr $25
; PIC32R6: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; N64: jalr $25
; N64R6: jalr $25
; PIC16: jalrc
@@ -68,6 +69,7 @@ entry:
; PIC32MM: jalr $25
; PIC32R6: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; N64: jalr $25
; N64R6: jalr $25
; PIC16: jalrc
@@ -85,6 +87,7 @@ entry:
; PIC32R6: jalr $25
; PIC32MM: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; N64: jalr $25
; N64R6: jalr $25
; PIC16: jalrc
@@ -102,6 +105,7 @@ entry:
; PIC32R6: jalr $25
; PIC32MM: jalr $25
; STATIC32: jal
+; SATATIC32MMR6: jal
; PIC64: jalr $25
; STATIC64: jal
; N64R6: jalr $25
@@ -120,6 +124,7 @@ entry:
; PIC32R6: jr $25
; PIC32MM: jr
; STATIC32: j
+; STATIC32MMR6: bc
; PIC64: jr $25
; STATIC64: j
; PIC16: jalrc
@@ -161,6 +166,7 @@ entry:
; PIC32R6: jrc $25
; PIC32MM: jrc
; STATIC32: j
+; STATIC32MMR6: bc
; PIC64: jr $25
; PIC64R6: jrc $25
; PIC64R6MM: jr $25
@@ -178,6 +184,7 @@ entry:
; PIC32R6: jalr $25
; PIC32MM: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; PIC64: jalr $25
; STATIC64: jal
; PIC16: jalrc
@@ -199,6 +206,7 @@ entry:
; PIC32R6: jrc $25
; PIC32MM: jrc
; STATIC32: j
+; STATIC32MMR6: bc
; PIC64: jr $25
; STATIC64: j
; PIC64R6: jrc $25
@@ -214,6 +222,7 @@ entry:
; PIC32R6: jalrc $25
; PIC32MM: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; STATIC64: jal
; PIC64: jalr $25
; PIC64R6: jalrc $25
@@ -232,6 +241,7 @@ entry:
; PIC32R6: jalr $25
; PIC32MM: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; STATIC64: jal
; PIC64: jalr $25
; PIC64R6: jalr $25
@@ -250,6 +260,7 @@ entry:
; PIC32R6: jalrc $25
; PIC32MM: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; STATIC64: jal
; PIC64: jalr $25
; PIC64R6: jalrc $25
@@ -270,6 +281,7 @@ entry:
; PIC32R6: jalrc $25
; PIC32MM: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; STATIC64: jal
; PIC64: jalr $25
; PIC64R6: jalrc $25
@@ -290,6 +302,7 @@ entry:
; PIC32R6: jalr $25
; PIC32MM: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; STATIC64: jal
; PIC64R6: jalr $25
; PIC64: jalr $25
diff --git a/test/CodeGen/NVPTX/atomics-sm60.ll b/test/CodeGen/NVPTX/atomics-sm60.ll
new file mode 100644
index 00000000000..0b5bafb780c
--- /dev/null
+++ b/test/CodeGen/NVPTX/atomics-sm60.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s
+
+; CHECK-LABEL .func test(
+define void @test(double* %dp0, double addrspace(1)* %dp1, double addrspace(3)* %dp3, double %d) {
+; CHECK: atom.add.f64
+ %r1 = call double @llvm.nvvm.atomic.load.add.f64.p0f64(double* %dp0, double %d)
+; CHECK: atom.global.add.f64
+ %r2 = call double @llvm.nvvm.atomic.load.add.f64.p1f64(double addrspace(1)* %dp1, double %d)
+; CHECK: atom.shared.add.f64
+ %ret = call double @llvm.nvvm.atomic.load.add.f64.p3f64(double addrspace(3)* %dp3, double %d)
+ ret void
+}
+
+declare double @llvm.nvvm.atomic.load.add.f64.p0f64(double* nocapture, double) #1
+declare double @llvm.nvvm.atomic.load.add.f64.p1f64(double addrspace(1)* nocapture, double) #1
+declare double @llvm.nvvm.atomic.load.add.f64.p3f64(double addrspace(3)* nocapture, double) #1
+
+attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll b/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll
index f874148c0e8..5df5183dc2f 100644
--- a/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll
+++ b/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll
@@ -1,6 +1,6 @@
; Verify functionality of NVPTXGenericToNVVM.cpp pass.
;
-; RUN: opt < %s -march nvptx64 -S -generic-to-nvvm -verify-debug-info | FileCheck %s
+; RUN: opt < %s -march nvptx64 -S -generic-to-nvvm | FileCheck %s
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
diff --git a/test/CodeGen/PowerPC/bswap64.ll b/test/CodeGen/PowerPC/bswap64.ll
new file mode 100644
index 00000000000..0a78aa2dc54
--- /dev/null
+++ b/test/CodeGen/PowerPC/bswap64.ll
@@ -0,0 +1,13 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64le-- -mcpu=pwr9 | FileCheck %s
+
+declare i64 @llvm.bswap.i64(i64)
+
+; CHECK: mtvsrdd
+; CHECK: xxbrd
+; CHECK: mfvsrd
+define i64 @bswap64(i64 %x) {
+entry:
+ %0 = call i64 @llvm.bswap.i64(i64 %x)
+ ret i64 %0
+}
+
diff --git a/test/CodeGen/PowerPC/p9-vinsert-vextract.ll b/test/CodeGen/PowerPC/p9-vinsert-vextract.ll
index 31bbc4b1351..c8c7d797c00 100644
--- a/test/CodeGen/PowerPC/p9-vinsert-vextract.ll
+++ b/test/CodeGen/PowerPC/p9-vinsert-vextract.ll
@@ -298,3 +298,825 @@ entry:
ret <8 x i16> %vecins
}
+; The following testcases take one byte element from the second vector and
+; inserts it at various locations in the first vector
+define <16 x i8> @shuffle_vector_byte_0_16(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_0_16
+; CHECK: vsldoi 3, 3, 3, 8
+; CHECK: vinsertb 2, 3, 15
+; CHECK-BE-LABEL: shuffle_vector_byte_0_16
+; CHECK-BE: vsldoi 3, 3, 3, 9
+; CHECK-BE: vinsertb 2, 3, 0
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_1_25(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_1_25
+; CHECK: vsldoi 3, 3, 3, 15
+; CHECK: vinsertb 2, 3, 14
+; CHECK-BE-LABEL: shuffle_vector_byte_1_25
+; CHECK-BE: vsldoi 3, 3, 3, 2
+; CHECK-BE: vinsertb 2, 3, 1
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 25, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_2_18(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_2_18
+; CHECK: vsldoi 3, 3, 3, 6
+; CHECK: vinsertb 2, 3, 13
+; CHECK-BE-LABEL: shuffle_vector_byte_2_18
+; CHECK-BE: vsldoi 3, 3, 3, 11
+; CHECK-BE: vinsertb 2, 3, 2
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_3_27(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_3_27
+; CHECK: vsldoi 3, 3, 3, 13
+; CHECK: vinsertb 2, 3, 12
+; CHECK-BE-LABEL: shuffle_vector_byte_3_27
+; CHECK-BE: vsldoi 3, 3, 3, 4
+; CHECK-BE: vinsertb 2, 3, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 27, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_4_20(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_4_20
+; CHECK: vsldoi 3, 3, 3, 4
+; CHECK: vinsertb 2, 3, 11
+; CHECK-BE-LABEL: shuffle_vector_byte_4_20
+; CHECK-BE: vsldoi 3, 3, 3, 13
+; CHECK-BE: vinsertb 2, 3, 4
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_5_29(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_5_29
+; CHECK: vsldoi 3, 3, 3, 11
+; CHECK: vinsertb 2, 3, 10
+; CHECK-BE-LABEL: shuffle_vector_byte_5_29
+; CHECK-BE: vsldoi 3, 3, 3, 6
+; CHECK-BE: vinsertb 2, 3, 5
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 29, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_6_22(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_6_22
+; CHECK: vsldoi 3, 3, 3, 2
+; CHECK: vinsertb 2, 3, 9
+; CHECK-BE-LABEL: shuffle_vector_byte_6_22
+; CHECK-BE: vsldoi 3, 3, 3, 15
+; CHECK-BE: vinsertb 2, 3, 6
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 22, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_7_31(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_7_31
+; CHECK: vsldoi 3, 3, 3, 9
+; CHECK: vinsertb 2, 3, 8
+; CHECK-BE-LABEL: shuffle_vector_byte_7_31
+; CHECK-BE: vsldoi 3, 3, 3, 8
+; CHECK-BE: vinsertb 2, 3, 7
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 31, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_8_24(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_8_24
+; CHECK: vinsertb 2, 3, 7
+; CHECK-BE-LABEL: shuffle_vector_byte_8_24
+; CHECK-BE: vsldoi 3, 3, 3, 1
+; CHECK-BE: vinsertb 2, 3, 8
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_9_17(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_9_17
+; CHECK: vsldoi 3, 3, 3, 7
+; CHECK: vinsertb 2, 3, 6
+; CHECK-BE-LABEL: shuffle_vector_byte_9_17
+; CHECK-BE: vsldoi 3, 3, 3, 10
+; CHECK-BE: vinsertb 2, 3, 9
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_10_26(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_10_26
+; CHECK: vsldoi 3, 3, 3, 14
+; CHECK: vinsertb 2, 3, 5
+; CHECK-BE-LABEL: shuffle_vector_byte_10_26
+; CHECK-BE: vsldoi 3, 3, 3, 3
+; CHECK-BE: vinsertb 2, 3, 10
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_11_19(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_11_19
+; CHECK: vsldoi 3, 3, 3, 5
+; CHECK: vinsertb 2, 3, 4
+; CHECK-BE-LABEL: shuffle_vector_byte_11_19
+; CHECK-BE: vsldoi 3, 3, 3, 12
+; CHECK-BE: vinsertb 2, 3, 11
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 19, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_12_28(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_12_28
+; CHECK: vsldoi 3, 3, 3, 12
+; CHECK: vinsertb 2, 3, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_12_28
+; CHECK-BE: vsldoi 3, 3, 3, 5
+; CHECK-BE: vinsertb 2, 3, 12
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_13_21(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_13_21
+; CHECK: vsldoi 3, 3, 3, 3
+; CHECK: vinsertb 2, 3, 2
+; CHECK-BE-LABEL: shuffle_vector_byte_13_21
+; CHECK-BE: vsldoi 3, 3, 3, 14
+; CHECK-BE: vinsertb 2, 3, 13
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 21, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_14_30(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_14_30
+; CHECK: vsldoi 3, 3, 3, 10
+; CHECK: vinsertb 2, 3, 1
+; CHECK-BE-LABEL: shuffle_vector_byte_14_30
+; CHECK-BE: vsldoi 3, 3, 3, 7
+; CHECK-BE: vinsertb 2, 3, 14
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 30, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_15_23(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_15_23
+; CHECK: vsldoi 3, 3, 3, 1
+; CHECK: vinsertb 2, 3, 0
+; CHECK-BE-LABEL: shuffle_vector_byte_15_23
+; CHECK-BE: vinsertb 2, 3, 15
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 23>
+ ret <16 x i8> %vecins
+}
+
+; The following testcases take one byte element from the first vector and
+; inserts it at various locations in the second vector
+define <16 x i8> @shuffle_vector_byte_16_8(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_16_8
+; CHECK: vinsertb 3, 2, 15
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_16_8
+; CHECK-BE: vsldoi 2, 2, 2, 1
+; CHECK-BE: vinsertb 3, 2, 0
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_17_1(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_17_1
+; CHECK: vsldoi 2, 2, 2, 7
+; CHECK: vinsertb 3, 2, 14
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_17_1
+; CHECK-BE: vsldoi 2, 2, 2, 10
+; CHECK-BE: vinsertb 3, 2, 1
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_18_10(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_18_10
+; CHECK: vsldoi 2, 2, 2, 14
+; CHECK: vinsertb 3, 2, 13
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_18_10
+; CHECK-BE: vsldoi 2, 2, 2, 3
+; CHECK-BE: vinsertb 3, 2, 2
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 10, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_19_3(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_19_3
+; CHECK: vsldoi 2, 2, 2, 5
+; CHECK: vinsertb 3, 2, 12
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_19_3
+; CHECK-BE: vsldoi 2, 2, 2, 12
+; CHECK-BE: vinsertb 3, 2, 3
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_20_12(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_20_12
+; CHECK: vsldoi 2, 2, 2, 12
+; CHECK: vinsertb 3, 2, 11
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_20_12
+; CHECK-BE: vsldoi 2, 2, 2, 5
+; CHECK-BE: vinsertb 3, 2, 4
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 12, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_21_5(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_21_5
+; CHECK: vsldoi 2, 2, 2, 3
+; CHECK: vinsertb 3, 2, 10
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_21_5
+; CHECK-BE: vsldoi 2, 2, 2, 14
+; CHECK-BE: vinsertb 3, 2, 5
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 5, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_22_14(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_22_14
+; CHECK: vsldoi 2, 2, 2, 10
+; CHECK: vinsertb 3, 2, 9
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_22_14
+; CHECK-BE: vsldoi 2, 2, 2, 7
+; CHECK-BE: vinsertb 3, 2, 6
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 14, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_23_7(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_23_7
+; CHECK: vsldoi 2, 2, 2, 1
+; CHECK: vinsertb 3, 2, 8
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_23_7
+; CHECK-BE: vinsertb 3, 2, 7
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_24_0(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_24_0
+; CHECK: vsldoi 2, 2, 2, 8
+; CHECK: vinsertb 3, 2, 7
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_24_0
+; CHECK-BE: vsldoi 2, 2, 2, 9
+; CHECK-BE: vinsertb 3, 2, 8
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_25_9(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_25_9
+; CHECK: vsldoi 2, 2, 2, 15
+; CHECK: vinsertb 3, 2, 6
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_25_9
+; CHECK-BE: vsldoi 2, 2, 2, 2
+; CHECK-BE: vinsertb 3, 2, 9
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 9, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_26_2(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_26_2
+; CHECK: vsldoi 2, 2, 2, 6
+; CHECK: vinsertb 3, 2, 5
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_26_2
+; CHECK-BE: vsldoi 2, 2, 2, 11
+; CHECK-BE: vinsertb 3, 2, 10
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 2, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_27_11(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_27_11
+; CHECK: vsldoi 2, 2, 2, 13
+; CHECK: vinsertb 3, 2, 4
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_27_11
+; CHECK-BE: vsldoi 2, 2, 2, 4
+; CHECK-BE: vinsertb 3, 2, 11
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 11, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_28_4(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_28_4
+; CHECK: vsldoi 2, 2, 2, 4
+; CHECK: vinsertb 3, 2, 3
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_28_4
+; CHECK-BE: vsldoi 2, 2, 2, 13
+; CHECK-BE: vinsertb 3, 2, 12
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 4, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_29_13(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_29_13
+; CHECK: vsldoi 2, 2, 2, 11
+; CHECK: vinsertb 3, 2, 2
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_29_13
+; CHECK-BE: vsldoi 2, 2, 2, 6
+; CHECK-BE: vinsertb 3, 2, 13
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 13, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_30_6(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_30_6
+; CHECK: vsldoi 2, 2, 2, 2
+; CHECK: vinsertb 3, 2, 1
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_30_6
+; CHECK-BE: vsldoi 2, 2, 2, 15
+; CHECK-BE: vinsertb 3, 2, 14
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 6, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_31_15(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_31_15
+; CHECK: vsldoi 2, 2, 2, 9
+; CHECK: vinsertb 3, 2, 0
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_31_15
+; CHECK-BE: vsldoi 2, 2, 2, 8
+; CHECK-BE: vinsertb 3, 2, 15
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 15>
+ ret <16 x i8> %vecins
+}
+
+; The following testcases use the same vector in both arguments of the
+; shufflevector. If byte element 7 in BE mode(or 8 in LE mode) is the one
+; we're attempting to insert, then we can use the vector insert instruction
+define <16 x i8> @shuffle_vector_byte_0_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_0_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_0_7
+; CHECK-BE: vinsertb 2, 2, 0
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_1_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_1_8
+; CHECK: vinsertb 2, 2, 14
+; CHECK-BE-LABEL: shuffle_vector_byte_1_8
+; CHECK-BE-NOT: vinsertb
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 8, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_2_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_2_8
+; CHECK: vinsertb 2, 2, 13
+; CHECK-BE-LABEL: shuffle_vector_byte_2_8
+; CHECK-BE-NOT: vinsertb
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_3_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_3_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_3_7
+; CHECK-BE: vinsertb 2, 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_4_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_4_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_4_7
+; CHECK-BE: vinsertb 2, 2, 4
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_5_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_5_8
+; CHECK: vinsertb 2, 2, 10
+; CHECK-BE-LABEL: shuffle_vector_byte_5_8
+; CHECK-BE-NOT: vinsertb
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_6_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_6_8
+; CHECK: vinsertb 2, 2, 9
+; CHECK-BE-LABEL: shuffle_vector_byte_6_8
+; CHECK-BE-NOT: vinsertb
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_7_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_7_8
+; CHECK: vinsertb 2, 2, 8
+; CHECK-BE-LABEL: shuffle_vector_byte_7_8
+; CHECK-BE-NOT: vinsertb
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 8, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_8_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_8_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_8_7
+; CHECK-BE: vinsertb 2, 2, 8
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 7, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_9_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_9_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_9_7
+; CHECK-BE: vinsertb 2, 2, 9
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 7, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_10_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_10_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_10_7
+; CHECK-BE: vinsertb 2, 2, 10
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 7, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_11_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_11_8
+; CHECK: vinsertb 2, 2, 4
+; CHECK-BE-LABEL: shuffle_vector_byte_11_8
+; CHECK-BE-NOT: vinsertb
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 8, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_12_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_12_8
+; CHECK: vinsertb 2, 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_12_8
+; CHECK-BE-NOT: vinsertb
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 8, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_13_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_13_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_13_7
+; CHECK-BE: vinsertb 2, 2, 13
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 7, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_14_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_14_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_14_7
+; CHECK-BE: vinsertb 2, 2, 14
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 7, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_15_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_15_8
+; CHECK: vinsertb 2, 2, 0
+; CHECK-BE-LABEL: shuffle_vector_byte_15_8
+; CHECK-BE-NOT: vinsertb
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 8>
+ ret <16 x i8> %vecins
+}
+
+; The following tests try to insert one halfword element into the vector. We
+; should always be using the 'vinserth' instruction.
+define <8 x i16> @insert_halfword_0(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_0
+; CHECK: vinserth 2, 3, 14
+; CHECK-BE-LABEL: insert_halfword_0
+; CHECK-BE: vinserth 2, 3, 0
+ %vecins = insertelement <8 x i16> %a, i16 %b, i32 0
+ ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_1(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_1
+; CHECK: vinserth 2, 3, 12
+; CHECK-BE-LABEL: insert_halfword_1
+; CHECK-BE: vinserth 2, 3, 2
+ %vecins = insertelement <8 x i16> %a, i16 %b, i32 1
+ ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_2(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_2
+; CHECK: vinserth 2, 3, 10
+; CHECK-BE-LABEL: insert_halfword_2
+; CHECK-BE: vinserth 2, 3, 4
+ %vecins = insertelement <8 x i16> %a, i16 %b, i32 2
+ ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_3(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_3
+; CHECK: vinserth 2, 3, 8
+; CHECK-BE-LABEL: insert_halfword_3
+; CHECK-BE: vinserth 2, 3, 6
+ %vecins = insertelement <8 x i16> %a, i16 %b, i32 3
+ ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_4(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_4
+; CHECK: vinserth 2, 3, 6
+; CHECK-BE-LABEL: insert_halfword_4
+; CHECK-BE: vinserth 2, 3, 8
+ %vecins = insertelement <8 x i16> %a, i16 %b, i32 4
+ ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_5(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_5
+; CHECK: vinserth 2, 3, 4
+; CHECK-BE-LABEL: insert_halfword_5
+; CHECK-BE: vinserth 2, 3, 10
+ %vecins = insertelement <8 x i16> %a, i16 %b, i32 5
+ ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_6(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_6
+; CHECK: vinserth 2, 3, 2
+; CHECK-BE-LABEL: insert_halfword_6
+; CHECK-BE: vinserth 2, 3, 12
+ %vecins = insertelement <8 x i16> %a, i16 %b, i32 6
+ ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_7(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_7
+; CHECK: vinserth 2, 3, 0
+; CHECK-BE-LABEL: insert_halfword_7
+; CHECK-BE: vinserth 2, 3, 14
+ %vecins = insertelement <8 x i16> %a, i16 %b, i32 7
+ ret <8 x i16> %vecins
+}
+
+; The following tests try to insert one byte element into the vector. We
+; should always be using the 'vinsertb' instruction.
+define <16 x i8> @insert_byte_0(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_0
+; CHECK: vinsertb 2, 3, 15
+; CHECK-BE-LABEL: insert_byte_0
+; CHECK-BE: vinsertb 2, 3, 0
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 0
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_1(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_1
+; CHECK: vinsertb 2, 3, 14
+; CHECK-BE-LABEL: insert_byte_1
+; CHECK-BE: vinsertb 2, 3, 1
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 1
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_2(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_2
+; CHECK: vinsertb 2, 3, 13
+; CHECK-BE-LABEL: insert_byte_2
+; CHECK-BE: vinsertb 2, 3, 2
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 2
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_3(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_3
+; CHECK: vinsertb 2, 3, 12
+; CHECK-BE-LABEL: insert_byte_3
+; CHECK-BE: vinsertb 2, 3, 3
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 3
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_4(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_4
+; CHECK: vinsertb 2, 3, 11
+; CHECK-BE-LABEL: insert_byte_4
+; CHECK-BE: vinsertb 2, 3, 4
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 4
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_5(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_5
+; CHECK: vinsertb 2, 3, 10
+; CHECK-BE-LABEL: insert_byte_5
+; CHECK-BE: vinsertb 2, 3, 5
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 5
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_6(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_6
+; CHECK: vinsertb 2, 3, 9
+; CHECK-BE-LABEL: insert_byte_6
+; CHECK-BE: vinsertb 2, 3, 6
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 6
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_7(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_7
+; CHECK: vinsertb 2, 3, 8
+; CHECK-BE-LABEL: insert_byte_7
+; CHECK-BE: vinsertb 2, 3, 7
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 7
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_8(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_8
+; CHECK: vinsertb 2, 3, 7
+; CHECK-BE-LABEL: insert_byte_8
+; CHECK-BE: vinsertb 2, 3, 8
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 8
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_9(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_9
+; CHECK: vinsertb 2, 3, 6
+; CHECK-BE-LABEL: insert_byte_9
+; CHECK-BE: vinsertb 2, 3, 9
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 9
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_10(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_10
+; CHECK: vinsertb 2, 3, 5
+; CHECK-BE-LABEL: insert_byte_10
+; CHECK-BE: vinsertb 2, 3, 10
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 10
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_11(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_11
+; CHECK: vinsertb 2, 3, 4
+; CHECK-BE-LABEL: insert_byte_11
+; CHECK-BE: vinsertb 2, 3, 11
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 11
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_12(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_12
+; CHECK: vinsertb 2, 3, 3
+; CHECK-BE-LABEL: insert_byte_12
+; CHECK-BE: vinsertb 2, 3, 12
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 12
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_13(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_13
+; CHECK: vinsertb 2, 3, 2
+; CHECK-BE-LABEL: insert_byte_13
+; CHECK-BE: vinsertb 2, 3, 13
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 13
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_14(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_14
+; CHECK: vinsertb 2, 3, 1
+; CHECK-BE-LABEL: insert_byte_14
+; CHECK-BE: vinsertb 2, 3, 14
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 14
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_15(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_15
+; CHECK: vinsertb 2, 3, 0
+; CHECK-BE-LABEL: insert_byte_15
+; CHECK-BE: vinsertb 2, 3, 15
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 15
+ ret <16 x i8> %vecins
+}
diff --git a/test/CodeGen/PowerPC/subreg-postra-2.ll b/test/CodeGen/PowerPC/subreg-postra-2.ll
index 338000cd8ba..794c9c190d1 100644
--- a/test/CodeGen/PowerPC/subreg-postra-2.ll
+++ b/test/CodeGen/PowerPC/subreg-postra-2.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gep-opt=0 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false -ppc-gep-opt=0 < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@@ -38,10 +38,10 @@ while.end418: ; preds = %wait_on_buffer.exit
; CHECK: stdcx.
; CHECK: isel {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, [[REG]]
; CHECK-NO-ISEL: bc 12, 20, [[TRUE:.LBB[0-9]+]]
-; CHECK-NO-ISEL: ori 4, 7, 0
+; CHECK-NO-ISEL: ori 7, 8, 0
; CHECK-NO-ISEL-NEXT: b [[SUCCESSOR:.LBB[0-9]+]]
; CHECK-NO-ISEL: [[TRUE]]
-; CHECK-NO-ISEL-NEXT: addi 4, 3, 0
+; CHECK-NO-ISEL: addi 7, 3, 0
if.then420: ; preds = %while.end418
unreachable
diff --git a/test/CodeGen/RISCV/alu32.ll b/test/CodeGen/RISCV/alu32.ll
index 32242d2e40d..9db6bb9dd43 100644
--- a/test/CodeGen/RISCV/alu32.ll
+++ b/test/CodeGen/RISCV/alu32.ll
@@ -7,7 +7,6 @@ define i32 @addi(i32 %a) nounwind {
; RV32I-LABEL: addi:
; RV32I: addi a0, a0, 1
; RV32I: jalr zero, ra, 0
-; TODO: check support for materialising larger constants
%1 = add i32 %a, 1
ret i32 %1
}
diff --git a/test/CodeGen/RISCV/branch.ll b/test/CodeGen/RISCV/branch.ll
new file mode 100644
index 00000000000..194083b07c7
--- /dev/null
+++ b/test/CodeGen/RISCV/branch.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefix=RV32I %s
+
+define void @foo(i32 %a, i32 *%b, i1 %c) {
+; RV32I-LABEL: foo:
+; RV32I: # BB#0:
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: beq a3, a0, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_1
+; RV32I-NEXT: .LBB0_1: # %test2
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: bne a3, a0, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_2
+; RV32I-NEXT: .LBB0_2: # %test3
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: blt a3, a0, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_3
+; RV32I-NEXT: .LBB0_3: # %test4
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: bge a3, a0, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_4
+; RV32I-NEXT: .LBB0_4: # %test5
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: bltu a3, a0, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_5
+; RV32I-NEXT: .LBB0_5: # %test6
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: bgeu a3, a0, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_6
+; RV32I-NEXT: .LBB0_6: # %test7
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: blt a0, a3, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_7
+; RV32I-NEXT: .LBB0_7: # %test8
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: bge a0, a3, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_8
+; RV32I-NEXT: .LBB0_8: # %test9
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: bltu a0, a3, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_9
+; RV32I-NEXT: .LBB0_9: # %test10
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: bgeu a0, a3, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_10
+; RV32I-NEXT: .LBB0_10: # %test11
+; RV32I-NEXT: lw a0, 0(a1)
+; RV32I-NEXT: andi a0, a2, 1
+; RV32I-NEXT: bne a0, zero, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_11
+; RV32I-NEXT: .LBB0_11: # %test12
+; RV32I-NEXT: lw a0, 0(a1)
+; RV32I-NEXT: .LBB0_12: # %end
+; RV32I-NEXT: jalr zero, ra, 0
+
+ %val1 = load volatile i32, i32* %b
+ %tst1 = icmp eq i32 %val1, %a
+ br i1 %tst1, label %end, label %test2
+
+test2:
+ %val2 = load volatile i32, i32* %b
+ %tst2 = icmp ne i32 %val2, %a
+ br i1 %tst2, label %end, label %test3
+
+test3:
+ %val3 = load volatile i32, i32* %b
+ %tst3 = icmp slt i32 %val3, %a
+ br i1 %tst3, label %end, label %test4
+
+test4:
+ %val4 = load volatile i32, i32* %b
+ %tst4 = icmp sge i32 %val4, %a
+ br i1 %tst4, label %end, label %test5
+
+test5:
+ %val5 = load volatile i32, i32* %b
+ %tst5 = icmp ult i32 %val5, %a
+ br i1 %tst5, label %end, label %test6
+
+test6:
+ %val6 = load volatile i32, i32* %b
+ %tst6 = icmp uge i32 %val6, %a
+ br i1 %tst6, label %end, label %test7
+
+; Check for condition codes that don't have a matching instruction
+
+test7:
+ %val7 = load volatile i32, i32* %b
+ %tst7 = icmp sgt i32 %val7, %a
+ br i1 %tst7, label %end, label %test8
+
+test8:
+ %val8 = load volatile i32, i32* %b
+ %tst8 = icmp sle i32 %val8, %a
+ br i1 %tst8, label %end, label %test9
+
+test9:
+ %val9 = load volatile i32, i32* %b
+ %tst9 = icmp ugt i32 %val9, %a
+ br i1 %tst9, label %end, label %test10
+
+test10:
+ %val10 = load volatile i32, i32* %b
+ %tst10 = icmp ule i32 %val10, %a
+ br i1 %tst10, label %end, label %test11
+
+; Check the case of a branch where the condition was generated in another
+; function
+
+test11:
+ %val11 = load volatile i32, i32* %b
+ br i1 %c, label %end, label %test12
+
+test12:
+ %val12 = load volatile i32, i32* %b
+ br label %end
+
+end:
+ ret void
+}
diff --git a/test/CodeGen/RISCV/calls.ll b/test/CodeGen/RISCV/calls.ll
new file mode 100644
index 00000000000..8abe5e92a8e
--- /dev/null
+++ b/test/CodeGen/RISCV/calls.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefix=RV32I %s
+
+declare i32 @external_function(i32)
+
+define i32 @test_call_external(i32 %a) nounwind {
+; RV32I-LABEL: test_call_external:
+; RV32I: # BB#0:
+; RV32I-NEXT: sw ra, 12(s0)
+; RV32I-NEXT: lui a1, %hi(external_function)
+; RV32I-NEXT: addi a1, a1, %lo(external_function)
+; RV32I-NEXT: jalr ra, a1, 0
+; RV32I-NEXT: lw ra, 12(s0)
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = call i32 @external_function(i32 %a)
+ ret i32 %1
+}
+
+define i32 @defined_function(i32 %a) nounwind {
+; RV32I-LABEL: defined_function:
+; RV32I: # BB#0:
+; RV32I-NEXT: addi a0, a0, 1
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = add i32 %a, 1
+ ret i32 %1
+}
+
+define i32 @test_call_defined(i32 %a) nounwind {
+; RV32I-LABEL: test_call_defined:
+; RV32I: # BB#0:
+; RV32I-NEXT: sw ra, 12(s0)
+; RV32I-NEXT: lui a1, %hi(defined_function)
+; RV32I-NEXT: addi a1, a1, %lo(defined_function)
+; RV32I-NEXT: jalr ra, a1, 0
+; RV32I-NEXT: lw ra, 12(s0)
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = call i32 @defined_function(i32 %a) nounwind
+ ret i32 %1
+}
+
+define i32 @test_call_indirect(i32 (i32)* %a, i32 %b) nounwind {
+; RV32I-LABEL: test_call_indirect:
+; RV32I: # BB#0:
+; RV32I-NEXT: sw ra, 12(s0)
+; RV32I-NEXT: addi a2, a0, 0
+; RV32I-NEXT: addi a0, a1, 0
+; RV32I-NEXT: jalr ra, a2, 0
+; RV32I-NEXT: lw ra, 12(s0)
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = call i32 %a(i32 %b)
+ ret i32 %1
+}
+
+; Ensure that calls to fastcc functions aren't rejected. Such calls may be
+; introduced when compiling with optimisation.
+
+define fastcc i32 @fastcc_function(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: fastcc_function:
+; RV32I: # BB#0:
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = add i32 %a, %b
+ ret i32 %1
+}
+
+define i32 @test_call_fastcc(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: test_call_fastcc:
+; RV32I: # BB#0:
+; RV32I-NEXT: sw ra, 12(s0)
+; RV32I-NEXT: sw s1, 8(s0)
+; RV32I-NEXT: addi s1, a0, 0
+; RV32I-NEXT: lui a0, %hi(fastcc_function)
+; RV32I-NEXT: addi a2, a0, %lo(fastcc_function)
+; RV32I-NEXT: addi a0, s1, 0
+; RV32I-NEXT: jalr ra, a2, 0
+; RV32I-NEXT: addi a0, s1, 0
+; RV32I-NEXT: lw s1, 8(s0)
+; RV32I-NEXT: lw ra, 12(s0)
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = call fastcc i32 @fastcc_function(i32 %a, i32 %b)
+ ret i32 %a
+}
diff --git a/test/CodeGen/RISCV/imm.ll b/test/CodeGen/RISCV/imm.ll
new file mode 100644
index 00000000000..c52638da02e
--- /dev/null
+++ b/test/CodeGen/RISCV/imm.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s -check-prefix=RV32I
+
+; Materializing constants
+
+define i32 @zero() nounwind {
+; RV32I-LABEL: zero:
+; RV32I: # BB#0:
+; RV32I-NEXT: addi a0, zero, 0
+; RV32I-NEXT: jalr zero, ra, 0
+ ret i32 0
+}
+
+define i32 @pos_small() nounwind {
+; RV32I-LABEL: pos_small:
+; RV32I: # BB#0:
+; RV32I-NEXT: addi a0, zero, 2047
+; RV32I-NEXT: jalr zero, ra, 0
+ ret i32 2047
+}
+
+define i32 @neg_small() nounwind {
+; RV32I-LABEL: neg_small:
+; RV32I: # BB#0:
+; RV32I-NEXT: addi a0, zero, -2048
+; RV32I-NEXT: jalr zero, ra, 0
+ ret i32 -2048
+}
+
+define i32 @pos_i32() nounwind {
+; RV32I-LABEL: pos_i32:
+; RV32I: # BB#0:
+; RV32I-NEXT: lui a0, 423811
+; RV32I-NEXT: addi a0, a0, -1297
+; RV32I-NEXT: jalr zero, ra, 0
+ ret i32 1735928559
+}
+
+define i32 @neg_i32() nounwind {
+; RV32I-LABEL: neg_i32:
+; RV32I: # BB#0:
+; RV32I-NEXT: lui a0, 912092
+; RV32I-NEXT: addi a0, a0, -273
+; RV32I-NEXT: jalr zero, ra, 0
+ ret i32 -559038737
+}
diff --git a/test/CodeGen/RISCV/mem.ll b/test/CodeGen/RISCV/mem.ll
new file mode 100644
index 00000000000..b06382f8742
--- /dev/null
+++ b/test/CodeGen/RISCV/mem.ll
@@ -0,0 +1,202 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s -check-prefix=RV32I
+
+; Check indexed and unindexed, sext, zext and anyext loads
+
+define i32 @lb(i8 *%a) nounwind {
+; RV32I-LABEL: lb:
+; RV32I: # BB#0:
+; RV32I-NEXT: lb a1, 0(a0)
+; RV32I-NEXT: lb a0, 1(a0)
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = getelementptr i8, i8* %a, i32 1
+ %2 = load i8, i8* %1
+ %3 = sext i8 %2 to i32
+ ; the unused load will produce an anyext for selection
+ %4 = load volatile i8, i8* %a
+ ret i32 %3
+}
+
+define i32 @lh(i16 *%a) nounwind {
+; RV32I-LABEL: lh:
+; RV32I: # BB#0:
+; RV32I-NEXT: lh a1, 0(a0)
+; RV32I-NEXT: lh a0, 4(a0)
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = getelementptr i16, i16* %a, i32 2
+ %2 = load i16, i16* %1
+ %3 = sext i16 %2 to i32
+ ; the unused load will produce an anyext for selection
+ %4 = load volatile i16, i16* %a
+ ret i32 %3
+}
+
+define i32 @lw(i32 *%a) nounwind {
+; RV32I-LABEL: lw:
+; RV32I: # BB#0:
+; RV32I-NEXT: lw a1, 0(a0)
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = getelementptr i32, i32* %a, i32 3
+ %2 = load i32, i32* %1
+ %3 = load volatile i32, i32* %a
+ ret i32 %2
+}
+
+define i32 @lbu(i8 *%a) nounwind {
+; RV32I-LABEL: lbu:
+; RV32I: # BB#0:
+; RV32I-NEXT: lbu a1, 0(a0)
+; RV32I-NEXT: lbu a0, 4(a0)
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = getelementptr i8, i8* %a, i32 4
+ %2 = load i8, i8* %1
+ %3 = zext i8 %2 to i32
+ %4 = load volatile i8, i8* %a
+ %5 = zext i8 %4 to i32
+ %6 = add i32 %3, %5
+ ret i32 %6
+}
+
+define i32 @lhu(i16 *%a) nounwind {
+; RV32I-LABEL: lhu:
+; RV32I: # BB#0:
+; RV32I-NEXT: lhu a1, 0(a0)
+; RV32I-NEXT: lhu a0, 10(a0)
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = getelementptr i16, i16* %a, i32 5
+ %2 = load i16, i16* %1
+ %3 = zext i16 %2 to i32
+ %4 = load volatile i16, i16* %a
+ %5 = zext i16 %4 to i32
+ %6 = add i32 %3, %5
+ ret i32 %6
+}
+
+; Check indexed and unindexed stores
+
+define void @sb(i8 *%a, i8 %b) nounwind {
+; RV32I-LABEL: sb:
+; RV32I: # BB#0:
+; RV32I-NEXT: sb a1, 6(a0)
+; RV32I-NEXT: sb a1, 0(a0)
+; RV32I-NEXT: jalr zero, ra, 0
+ store i8 %b, i8* %a
+ %1 = getelementptr i8, i8* %a, i32 6
+ store i8 %b, i8* %1
+ ret void
+}
+
+define void @sh(i16 *%a, i16 %b) nounwind {
+; RV32I-LABEL: sh:
+; RV32I: # BB#0:
+; RV32I-NEXT: sh a1, 14(a0)
+; RV32I-NEXT: sh a1, 0(a0)
+; RV32I-NEXT: jalr zero, ra, 0
+ store i16 %b, i16* %a
+ %1 = getelementptr i16, i16* %a, i32 7
+ store i16 %b, i16* %1
+ ret void
+}
+
+define void @sw(i32 *%a, i32 %b) nounwind {
+; RV32I-LABEL: sw:
+; RV32I: # BB#0:
+; RV32I-NEXT: sw a1, 32(a0)
+; RV32I-NEXT: sw a1, 0(a0)
+; RV32I-NEXT: jalr zero, ra, 0
+ store i32 %b, i32* %a
+ %1 = getelementptr i32, i32* %a, i32 8
+ store i32 %b, i32* %1
+ ret void
+}
+
+; Check load and store to an i1 location
+define i32 @load_sext_zext_anyext_i1(i1 *%a) nounwind {
+; RV32I-LABEL: load_sext_zext_anyext_i1:
+; RV32I: # BB#0:
+; RV32I-NEXT: lb a1, 0(a0)
+; RV32I-NEXT: lbu a1, 1(a0)
+; RV32I-NEXT: lbu a0, 2(a0)
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: jalr zero, ra, 0
+ ; sextload i1
+ %1 = getelementptr i1, i1* %a, i32 1
+ %2 = load i1, i1* %1
+ %3 = sext i1 %2 to i32
+ ; zextload i1
+ %4 = getelementptr i1, i1* %a, i32 2
+ %5 = load i1, i1* %4
+ %6 = zext i1 %5 to i32
+ %7 = add i32 %3, %6
+ ; extload i1 (anyext). Produced as the load is unused.
+ %8 = load volatile i1, i1* %a
+ ret i32 %7
+}
+
+define i16 @load_sext_zext_anyext_i1_i16(i1 *%a) nounwind {
+; RV32I-LABEL: load_sext_zext_anyext_i1_i16:
+; RV32I: # BB#0:
+; RV32I-NEXT: lb a1, 0(a0)
+; RV32I-NEXT: lbu a1, 1(a0)
+; RV32I-NEXT: lbu a0, 2(a0)
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: jalr zero, ra, 0
+ ; sextload i1
+ %1 = getelementptr i1, i1* %a, i32 1
+ %2 = load i1, i1* %1
+ %3 = sext i1 %2 to i16
+ ; zextload i1
+ %4 = getelementptr i1, i1* %a, i32 2
+ %5 = load i1, i1* %4
+ %6 = zext i1 %5 to i16
+ %7 = add i16 %3, %6
+ ; extload i1 (anyext). Produced as the load is unused.
+ %8 = load volatile i1, i1* %a
+ ret i16 %7
+}
+
+; Check load and store to a global
+@G = global i32 0
+
+define i32 @lw_sw_global(i32 %a) nounwind {
+; TODO: the addi should be folded in to the lw/sw operations
+; RV32I-LABEL: lw_sw_global:
+; RV32I: # BB#0:
+; RV32I-NEXT: lui a1, %hi(G)
+; RV32I-NEXT: addi a2, a1, %lo(G)
+; RV32I-NEXT: lw a1, 0(a2)
+; RV32I-NEXT: sw a0, 0(a2)
+; RV32I-NEXT: lui a2, %hi(G+36)
+; RV32I-NEXT: addi a2, a2, %lo(G+36)
+; RV32I-NEXT: lw a3, 0(a2)
+; RV32I-NEXT: sw a0, 0(a2)
+; RV32I-NEXT: addi a0, a1, 0
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = load volatile i32, i32* @G
+ store i32 %a, i32* @G
+ %2 = getelementptr i32, i32* @G, i32 9
+ %3 = load volatile i32, i32* %2
+ store i32 %a, i32* %2
+ ret i32 %1
+}
+
+; Ensure that 1 is added to the high 20 bits if bit 11 of the low part is 1
+define i32 @lw_sw_constant(i32 %a) nounwind {
+; TODO: the addi should be folded in to the lw/sw
+; RV32I-LABEL: lw_sw_constant:
+; RV32I: # BB#0:
+; RV32I-NEXT: lui a1, 912092
+; RV32I-NEXT: addi a2, a1, -273
+; RV32I-NEXT: lw a1, 0(a2)
+; RV32I-NEXT: sw a0, 0(a2)
+; RV32I-NEXT: addi a0, a1, 0
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = inttoptr i32 3735928559 to i32*
+ %2 = load volatile i32, i32* %1
+ store i32 %a, i32* %1
+ ret i32 %2
+}
diff --git a/test/CodeGen/RISCV/wide-mem.ll b/test/CodeGen/RISCV/wide-mem.ll
new file mode 100644
index 00000000000..18ab52aaf13
--- /dev/null
+++ b/test/CodeGen/RISCV/wide-mem.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s -check-prefix=RV32I
+
+; Check load/store operations on values wider than what is natively supported
+
+define i64 @load_i64(i64 *%a) nounwind {
+; RV32I-LABEL: load_i64:
+; RV32I: # BB#0:
+; RV32I-NEXT: lw a2, 0(a0)
+; RV32I-NEXT: lw a1, 4(a0)
+; RV32I-NEXT: addi a0, a2, 0
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = load i64, i64* %a
+ ret i64 %1
+}
+
+@val64 = local_unnamed_addr global i64 2863311530, align 8
+
+; TODO: codegen on this should be improved. It shouldn't be necessary to
+; generate two addi
+define i64 @load_i64_global() nounwind {
+; RV32I-LABEL: load_i64_global:
+; RV32I: # BB#0:
+; RV32I-NEXT: lui a0, %hi(val64)
+; RV32I-NEXT: addi a0, a0, %lo(val64)
+; RV32I-NEXT: lw a0, 0(a0)
+; RV32I-NEXT: lui a1, %hi(val64+4)
+; RV32I-NEXT: addi a1, a1, %lo(val64+4)
+; RV32I-NEXT: lw a1, 0(a1)
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = load i64, i64* @val64
+ ret i64 %1
+}
diff --git a/test/CodeGen/WebAssembly/inline-asm-m.ll b/test/CodeGen/WebAssembly/inline-asm-m.ll
new file mode 100644
index 00000000000..8d514a528fd
--- /dev/null
+++ b/test/CodeGen/WebAssembly/inline-asm-m.ll
@@ -0,0 +1,13 @@
+; RUN: not llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -no-integrated-as
+
+; Test basic inline assembly "m" operands, which are unsupported. Pass
+; -no-integrated-as since these aren't actually valid assembly syntax.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown-wasm"
+
+define void @bar(i32* %r, i32* %s) {
+entry:
+ tail call void asm sideeffect "# $0 = bbb($1)", "=*m,*m"(i32* %s, i32* %r) #0, !srcloc !1
+ ret void
+}
diff --git a/test/CodeGen/WebAssembly/inline-asm.ll b/test/CodeGen/WebAssembly/inline-asm.ll
index 56576305d9e..760b0ad0de6 100644
--- a/test/CodeGen/WebAssembly/inline-asm.ll
+++ b/test/CodeGen/WebAssembly/inline-asm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -no-integrated-as | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -no-integrated-as | FileCheck %s
; Test basic inline assembly. Pass -no-integrated-as since these aren't
; actually valid assembly syntax.
@@ -10,33 +10,24 @@ target triple = "wasm32-unknown-unknown-wasm"
; CHECK-NEXT: .param i32{{$}}
; CHECK-NEXT: .result i32{{$}}
; CHECK-NEXT: #APP{{$}}
-; CHECK-NEXT: # $0 = aaa($0){{$}}
+; CHECK-NEXT: # 0 = aaa(0){{$}}
; CHECK-NEXT: #NO_APP{{$}}
-; CHECK-NEXT: return $0{{$}}
+; CHECK-NEXT: get_local $push0=, 0{{$}}
+; CHECK-NEXT: return $pop0{{$}}
define i32 @foo(i32 %r) {
entry:
%0 = tail call i32 asm sideeffect "# $0 = aaa($1)", "=r,r"(i32 %r) #0, !srcloc !0
ret i32 %0
}
-; CHECK-LABEL: bar:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: #APP{{$}}
-; CHECK-NEXT: # 0($1) = bbb(0($0)){{$}}
-; CHECK-NEXT: #NO_APP{{$}}
-; CHECK-NEXT: return{{$}}
-define void @bar(i32* %r, i32* %s) {
-entry:
- tail call void asm sideeffect "# $0 = bbb($1)", "=*m,*m"(i32* %s, i32* %r) #0, !srcloc !1
- ret void
-}
-
; CHECK-LABEL: imm:
; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .local i32{{$}}
; CHECK-NEXT: #APP{{$}}
-; CHECK-NEXT: # $0 = ccc(42){{$}}
+; CHECK-NEXT: # 0 = ccc(42){{$}}
; CHECK-NEXT: #NO_APP{{$}}
-; CHECK-NEXT: return $0{{$}}
+; CHECK-NEXT: get_local $push0=, 0{{$}}
+; CHECK-NEXT: return $pop0{{$}}
define i32 @imm() {
entry:
%0 = tail call i32 asm sideeffect "# $0 = ccc($1)", "=r,i"(i32 42) #0, !srcloc !2
@@ -47,9 +38,10 @@ entry:
; CHECK-NEXT: .param i64{{$}}
; CHECK-NEXT: .result i64{{$}}
; CHECK-NEXT: #APP{{$}}
-; CHECK-NEXT: # $0 = aaa($0){{$}}
+; CHECK-NEXT: # 0 = aaa(0){{$}}
; CHECK-NEXT: #NO_APP{{$}}
-; CHECK-NEXT: return $0{{$}}
+; CHECK-NEXT: get_local $push0=, 0{{$}}
+; CHECK-NEXT: return $pop0{{$}}
define i64 @foo_i64(i64 %r) {
entry:
%0 = tail call i64 asm sideeffect "# $0 = aaa($1)", "=r,r"(i64 %r) #0, !srcloc !0
@@ -57,16 +49,20 @@ entry:
}
; CHECK-LABEL: X_i16:
-; CHECK: foo $1{{$}}
-; CHECK: i32.store16 0($0), $1{{$}}
+; CHECK: foo 1{{$}}
+; CHECK: get_local $push[[S0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[S1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.store16 0($pop[[S0]]), $pop[[S1]]{{$}}
define void @X_i16(i16 * %t) {
call void asm sideeffect "foo $0", "=*X,~{dirflag},~{fpsr},~{flags},~{memory}"(i16* %t)
ret void
}
; CHECK-LABEL: X_ptr:
-; CHECK: foo $1{{$}}
-; CHECK: i32.store 0($0), $1{{$}}
+; CHECK: foo 1{{$}}
+; CHECK: get_local $push[[S0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[S1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.store 0($pop[[S0]]), $pop[[S1]]{{$}}
define void @X_ptr(i16 ** %t) {
call void asm sideeffect "foo $0", "=*X,~{dirflag},~{fpsr},~{flags},~{memory}"(i16** %t)
ret void
@@ -87,6 +83,20 @@ define void @varname() {
ret void
}
+; CHECK-LABEL: r_constraint
+; CHECK: i32.const $push[[S0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: set_local [[L0:[0-9]+]], $pop[[S0]]{{$}}
+; CHECK-NEXT: i32.const $push[[S1:[0-9]+]]=, 37{{$}}
+; CHECK-NEXT: set_local [[L1:[0-9]+]], $pop[[S1]]{{$}}
+; CHECK: foo [[L2:[0-9]+]], 1, [[L0]], [[L1]]{{$}}
+; CHECK: get_local $push{{[0-9]+}}=, [[L2]]{{$}}
+define hidden i32 @r_constraint(i32 %a, i32 %y) {
+entry:
+ %z = bitcast i32 0 to i32
+ %t0 = tail call i32 asm "foo $0, $1, $2, $3", "=r,r,r,r"(i32 %y, i32 %z, i32 37) #0, !srcloc !0
+ ret i32 %t0
+}
+
attributes #0 = { nounwind }
!0 = !{i32 47}
diff --git a/test/CodeGen/WebAssembly/signext-arg.ll b/test/CodeGen/WebAssembly/signext-arg.ll
new file mode 100644
index 00000000000..cd116c645b4
--- /dev/null
+++ b/test/CodeGen/WebAssembly/signext-arg.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=wasm32 | FileCheck %s
+
+declare i32 @get_int(i16 %arg)
+
+define i32 @func_1(i16 %arg1 , i32 %arg2) #0 {
+; CHECK-LABEL: func_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: i32.const $push1=, 16
+; CHECK-NEXT: i32.shl $push2=, $0, $pop1
+; CHECK-NEXT: i32.const $push4=, 16
+; CHECK-NEXT: i32.shr_s $push3=, $pop2, $pop4
+; CHECK-NEXT: i32.call $push0=, get_int@FUNCTION, $pop3
+; CHECK-NEXT: # fallthrough-return: $pop0
+; CHECK-NEXT: .endfunc
+entry:
+ %retval = call i32 @get_int(i16 signext %arg1)
+ ret i32 %retval
+}
+
+attributes #0 = {noinline nounwind optnone}
+
diff --git a/test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll b/test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll
index 6814ed1d894..109962c2859 100644
--- a/test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll
+++ b/test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll
@@ -23,6 +23,7 @@ lpad: ; preds = %cont, %entry
}
; CHECK: lpad
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: Ltmp
declare i32 @__gxx_personality_v0(...)
diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index 416761ffef4..dd059100503 100644
--- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -88,6 +88,7 @@ define void @full_test() {
; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp)
; X32-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: addl $60, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: full_test:
diff --git a/test/CodeGen/X86/GlobalISel/add-scalar.ll b/test/CodeGen/X86/GlobalISel/add-scalar.ll
index 64a6313023b..9d28f441fb7 100644
--- a/test/CodeGen/X86/GlobalISel/add-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/add-scalar.ll
@@ -20,6 +20,7 @@ define i64 @test_add_i64(i64 %arg1, i64 %arg2) {
; X32-NEXT: addl 8(%ebp), %eax
; X32-NEXT: adcl 12(%ebp), %edx
; X32-NEXT: popl %ebp
+; X32-NEXT: .cfi_def_cfa %esp, 4
; X32-NEXT: retl
%ret = add i64 %arg1, %arg2
ret i64 %ret
diff --git a/test/CodeGen/X86/GlobalISel/brcond.ll b/test/CodeGen/X86/GlobalISel/brcond.ll
index 917ee6f5bd8..2467344776e 100644
--- a/test/CodeGen/X86/GlobalISel/brcond.ll
+++ b/test/CodeGen/X86/GlobalISel/brcond.ll
@@ -36,6 +36,7 @@ define i32 @test_1(i32 %a, i32 %b, i32 %tValue, i32 %fValue) {
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: movl (%esp), %eax
; X32-NEXT: popl %ecx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
%retval = alloca i32, align 4
diff --git a/test/CodeGen/X86/GlobalISel/callingconv.ll b/test/CodeGen/X86/GlobalISel/callingconv.ll
index 4100a7217ac..23987a3c365 100644
--- a/test/CodeGen/X86/GlobalISel/callingconv.ll
+++ b/test/CodeGen/X86/GlobalISel/callingconv.ll
@@ -117,6 +117,7 @@ define <8 x i32> @test_v8i32_args(<8 x i32> %arg1, <8 x i32> %arg2) {
; X32-NEXT: movups 16(%esp), %xmm1
; X32-NEXT: movaps %xmm2, %xmm0
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_v8i32_args:
@@ -135,6 +136,7 @@ define void @test_trivial_call() {
; X32-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: calll trivial_callee
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_trivial_call:
@@ -143,6 +145,7 @@ define void @test_trivial_call() {
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: callq trivial_callee
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
call void @trivial_callee()
ret void
@@ -160,6 +163,7 @@ define void @test_simple_arg_call(i32 %in0, i32 %in1) {
; X32-NEXT: movl %eax, 4(%esp)
; X32-NEXT: calll simple_arg_callee
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_simple_arg_call:
@@ -171,6 +175,7 @@ define void @test_simple_arg_call(i32 %in0, i32 %in1) {
; X64-NEXT: movl %eax, %esi
; X64-NEXT: callq simple_arg_callee
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
call void @simple_arg_callee(i32 %in1, i32 %in0)
ret void
@@ -193,6 +198,7 @@ define void @test_simple_arg8_call(i32 %in0) {
; X32-NEXT: movl %eax, 28(%esp)
; X32-NEXT: calll simple_arg8_callee
; X32-NEXT: addl $44, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_simple_arg8_call:
@@ -208,6 +214,7 @@ define void @test_simple_arg8_call(i32 %in0) {
; X64-NEXT: movl %edi, %r9d
; X64-NEXT: callq simple_arg8_callee
; X64-NEXT: addq $24, %rsp
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
call void @simple_arg8_callee(i32 %in0, i32 %in0, i32 %in0, i32 %in0,i32 %in0, i32 %in0, i32 %in0, i32 %in0)
ret void
@@ -224,6 +231,7 @@ define i32 @test_simple_return_callee() {
; X32-NEXT: calll simple_return_callee
; X32-NEXT: addl %eax, %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_simple_return_callee:
@@ -234,6 +242,7 @@ define i32 @test_simple_return_callee() {
; X64-NEXT: callq simple_return_callee
; X64-NEXT: addl %eax, %eax
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
%call = call i32 @simple_return_callee(i32 5)
%r = add i32 %call, %call
@@ -254,6 +263,7 @@ define <8 x i32> @test_split_return_callee(<8 x i32> %arg1, <8 x i32> %arg2) {
; X32-NEXT: paddd (%esp), %xmm0 # 16-byte Folded Reload
; X32-NEXT: paddd 16(%esp), %xmm1 # 16-byte Folded Reload
; X32-NEXT: addl $44, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_split_return_callee:
@@ -268,6 +278,7 @@ define <8 x i32> @test_split_return_callee(<8 x i32> %arg1, <8 x i32> %arg2) {
; X64-NEXT: paddd (%rsp), %xmm0 # 16-byte Folded Reload
; X64-NEXT: paddd 16(%rsp), %xmm1 # 16-byte Folded Reload
; X64-NEXT: addq $40, %rsp
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
%call = call <8 x i32> @split_return_callee(<8 x i32> %arg2)
%r = add <8 x i32> %arg1, %call
@@ -281,6 +292,7 @@ define void @test_indirect_call(void()* %func) {
; X32-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: calll *16(%esp)
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_indirect_call:
@@ -289,6 +301,7 @@ define void @test_indirect_call(void()* %func) {
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: callq *%rdi
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
call void %func()
ret void
@@ -317,8 +330,11 @@ define void @test_abi_exts_call(i8* %addr) {
; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: calll take_char
; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_abi_exts_call:
@@ -335,6 +351,7 @@ define void @test_abi_exts_call(i8* %addr) {
; X64-NEXT: movl %ebx, %edi
; X64-NEXT: callq take_char
; X64-NEXT: popq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
%val = load i8, i8* %addr
call void @take_char(i8 %val)
@@ -357,6 +374,7 @@ define void @test_variadic_call_1(i8** %addr_ptr, i32* %val_ptr) {
; X32-NEXT: movl %ecx, 4(%esp)
; X32-NEXT: calll variadic_callee
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_variadic_call_1:
@@ -368,6 +386,7 @@ define void @test_variadic_call_1(i8** %addr_ptr, i32* %val_ptr) {
; X64-NEXT: movb $0, %al
; X64-NEXT: callq variadic_callee
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
%addr = load i8*, i8** %addr_ptr
@@ -393,6 +412,7 @@ define void @test_variadic_call_2(i8** %addr_ptr, double* %val_ptr) {
; X32-NEXT: movl %ecx, 4(%eax)
; X32-NEXT: calll variadic_callee
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_variadic_call_2:
@@ -405,6 +425,7 @@ define void @test_variadic_call_2(i8** %addr_ptr, double* %val_ptr) {
; X64-NEXT: movq %rcx, %xmm0
; X64-NEXT: callq variadic_callee
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
%addr = load i8*, i8** %addr_ptr
diff --git a/test/CodeGen/X86/GlobalISel/frameIndex.ll b/test/CodeGen/X86/GlobalISel/frameIndex.ll
index 7b2a050f153..f260d0d707f 100644
--- a/test/CodeGen/X86/GlobalISel/frameIndex.ll
+++ b/test/CodeGen/X86/GlobalISel/frameIndex.ll
@@ -18,6 +18,7 @@ define i32* @allocai32() {
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movl %esp, %eax
; X32-NEXT: popl %ecx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X32ABI-LABEL: allocai32:
diff --git a/test/CodeGen/X86/GlobalISel/select-cmp.mir b/test/CodeGen/X86/GlobalISel/select-cmp.mir
index 9058f010f76..3457e971b8d 100644
--- a/test/CodeGen/X86/GlobalISel/select-cmp.mir
+++ b/test/CodeGen/X86/GlobalISel/select-cmp.mir
@@ -100,7 +100,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr8 = COPY %sil
; CHECK: CMP8rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -131,7 +131,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr16 = COPY %si
; CHECK: CMP16rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -162,7 +162,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr64 = COPY %rsi
; CHECK: CMP64rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -193,7 +193,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -224,7 +224,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETNEr:%[0-9]+]]:gr8 = SETNEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETNEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETNEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -255,7 +255,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETAr:%[0-9]+]]:gr8 = SETAr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -286,7 +286,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETAEr:%[0-9]+]]:gr8 = SETAEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -317,7 +317,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETBr:%[0-9]+]]:gr8 = SETBr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -348,7 +348,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETBEr:%[0-9]+]]:gr8 = SETBEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -379,7 +379,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETGr:%[0-9]+]]:gr8 = SETGr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -410,7 +410,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETGEr:%[0-9]+]]:gr8 = SETGEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -441,7 +441,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETLr:%[0-9]+]]:gr8 = SETLr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -472,7 +472,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETLEr:%[0-9]+]]:gr8 = SETLEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
diff --git a/test/CodeGen/X86/GlobalISel/select-copy.mir b/test/CodeGen/X86/GlobalISel/select-copy.mir
index a72f42782c0..fccba1f8206 100644
--- a/test/CodeGen/X86/GlobalISel/select-copy.mir
+++ b/test/CodeGen/X86/GlobalISel/select-copy.mir
@@ -42,7 +42,7 @@ registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
# ALL: %0:gr8 = COPY %al
-# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, 1
+# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit
# ALL-NEXT: %1:gr32 = AND32ri8 %2, 1, implicit-def %eflags
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
@@ -146,7 +146,7 @@ regBankSelected: true
registers:
- { id: 0, class: gpr, preferred-register: '' }
# ALL: %0:gr8 = COPY %dl
-# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, 1
+# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
body: |
@@ -170,7 +170,7 @@ regBankSelected: true
registers:
- { id: 0, class: gpr, preferred-register: '' }
# ALL: %0:gr16 = COPY %dx
-# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, 3
+# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_16bit
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
body: |
diff --git a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
index 51088e126e5..9df24f65b36 100644
--- a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
+++ b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
@@ -39,7 +39,7 @@ body: |
; ALL-LABEL: name: test_zext_i1
; ALL: [[COPY:%[0-9]+]]:gr8 = COPY %dil
; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]]
- ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 1
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_8bit
; ALL: [[AND64ri8_:%[0-9]+]]:gr64 = AND64ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; ALL: %rax = COPY [[AND64ri8_]]
; ALL: RET 0, implicit %rax
@@ -112,7 +112,7 @@ body: |
; ALL-LABEL: name: anyext_s64_from_s1
; ALL: [[COPY:%[0-9]+]]:gr64_with_sub_8bit = COPY %rdi
; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
- ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 1
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_8bit
; ALL: %rax = COPY [[SUBREG_TO_REG]]
; ALL: RET 0, implicit %rax
%0(s64) = COPY %rdi
@@ -137,7 +137,7 @@ body: |
; ALL-LABEL: name: anyext_s64_from_s8
; ALL: [[COPY:%[0-9]+]]:gr64_with_sub_8bit = COPY %rdi
; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
- ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 1
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_8bit
; ALL: %rax = COPY [[SUBREG_TO_REG]]
; ALL: RET 0, implicit %rax
%0(s64) = COPY %rdi
@@ -162,7 +162,7 @@ body: |
; ALL-LABEL: name: anyext_s64_from_s16
; ALL: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
; ALL: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
- ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 3
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_16bit
; ALL: %rax = COPY [[SUBREG_TO_REG]]
; ALL: RET 0, implicit %rax
%0(s64) = COPY %rdi
@@ -187,7 +187,7 @@ body: |
; ALL-LABEL: name: anyext_s64_from_s32
; ALL: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
; ALL: [[COPY1:%[0-9]+]]:gr32 = COPY [[COPY]].sub_32bit
- ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 4
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_32bit
; ALL: %rax = COPY [[SUBREG_TO_REG]]
; ALL: RET 0, implicit %rax
%0(s64) = COPY %rdi
diff --git a/test/CodeGen/X86/GlobalISel/select-ext.mir b/test/CodeGen/X86/GlobalISel/select-ext.mir
index 5167ee987a5..90ac0c6763a 100644
--- a/test/CodeGen/X86/GlobalISel/select-ext.mir
+++ b/test/CodeGen/X86/GlobalISel/select-ext.mir
@@ -85,7 +85,7 @@ registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
# ALL: %0:gr8 = COPY %dil
-# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %0, 1
+# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit
# ALL-NEXT: %1:gr16 = AND16ri8 %2, 1, implicit-def %eflags
# ALL-NEXT: %ax = COPY %1
# ALL-NEXT: RET 0, implicit %ax
@@ -113,7 +113,7 @@ registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
# ALL: %0:gr8 = COPY %dil
-# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, 1
+# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit
# ALL-NEXT: %1:gr32 = AND32ri8 %2, 1, implicit-def %eflags
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
@@ -288,7 +288,7 @@ registers:
# X32: %0:gr32_abcd = COPY %edi
# X64: %0:gr32 = COPY %edi
# ALL-NEXT: %1:gr8 = COPY %0.sub_8bit
-# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, 1
+# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, %subreg.sub_8bit
# ALL-NEXT: %ax = COPY %2
# ALL-NEXT: RET 0, implicit %ax
body: |
@@ -323,7 +323,7 @@ registers:
# X32: %0:gr32_abcd = COPY %edi
# X64: %0:gr32 = COPY %edi
# ALL-NEXT: %1:gr8 = COPY %0.sub_8bit
-# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, 1
+# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, %subreg.sub_8bit
# ALL-NEXT: %eax = COPY %2
# ALL-NEXT: RET 0, implicit %eax
body: |
@@ -358,7 +358,7 @@ registers:
# X32: %0:gr32_abcd = COPY %edi
# X64: %0:gr32 = COPY %edi
# ALL-NEXT: %1:gr8 = COPY %0.sub_8bit
-# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, 1
+# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, %subreg.sub_8bit
# ALL-NEXT: %ax = COPY %2
# ALL-NEXT: RET 0, implicit %ax
body: |
@@ -422,7 +422,7 @@ registers:
- { id: 2, class: gpr }
# ALL: %0:gr32 = COPY %edi
# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit
-# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, 3
+# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, %subreg.sub_16bit
# ALL-NEXT: %eax = COPY %2
# ALL-NEXT: RET 0, implicit %eax
body: |
diff --git a/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir b/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir
index 596c48b4922..628ab3bac4a 100644
--- a/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir
+++ b/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir
@@ -20,7 +20,7 @@ body: |
bb.0:
; CHECK-LABEL: name: read_flags
; CHECK: [[RDFLAGS32_:%[0-9]+]]:gr32 = RDFLAGS32 implicit-def %esp, implicit %esp
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[RDFLAGS32_]], 4
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[RDFLAGS32_]], %subreg.sub_32bit
; CHECK: %rax = COPY [[SUBREG_TO_REG]]
%0(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.x86.flags.read.u32)
%rax = COPY %0(s32)
diff --git a/test/CodeGen/X86/O0-pipeline.ll b/test/CodeGen/X86/O0-pipeline.ll
index 1f7415ee2af..8ecafad8022 100644
--- a/test/CodeGen/X86/O0-pipeline.ll
+++ b/test/CodeGen/X86/O0-pipeline.ll
@@ -49,6 +49,7 @@
; CHECK-NEXT: X86 pseudo instruction expansion pass
; CHECK-NEXT: Analyze Machine Code For Garbage Collection
; CHECK-NEXT: X86 vzeroupper inserter
+; CHECK-NEXT: Check CFA info and insert CFI instructions if needed
; CHECK-NEXT: Contiguously Lay Out Funclets
; CHECK-NEXT: StackMap Liveness Analysis
; CHECK-NEXT: Live DEBUG_VALUE analysis
diff --git a/test/CodeGen/X86/TruncAssertZext.ll b/test/CodeGen/X86/TruncAssertZext.ll
index b9ae57ca011..ed98fd51cc0 100644
--- a/test/CodeGen/X86/TruncAssertZext.ll
+++ b/test/CodeGen/X86/TruncAssertZext.ll
@@ -25,6 +25,7 @@ define i64 @main() {
; CHECK-NEXT: subq %rcx, %rax
; CHECK-NEXT: shrq $32, %rax
; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%b = call i64 @foo()
%or = and i64 %b, 18446744069414584575 ; this is 0xffffffff000000ff
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index 508f10e9889..14494779f10 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -2209,62 +2209,53 @@ define void @avg_v16i8_const(<16 x i8>* %a) nounwind {
define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v32i8_const:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm5
-; SSE2-NEXT: movdqa 16(%rdi), %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [5,6,7,8]
-; SSE2-NEXT: paddd %xmm9, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,3,4]
-; SSE2-NEXT: paddd %xmm3, %xmm7
-; SSE2-NEXT: paddd %xmm9, %xmm6
-; SSE2-NEXT: paddd %xmm3, %xmm4
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,2,3,4]
+; SSE2-NEXT: paddd %xmm9, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,6,7,8]
+; SSE2-NEXT: paddd %xmm4, %xmm8
; SSE2-NEXT: paddd %xmm9, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm8
+; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm9, %xmm3
+; SSE2-NEXT: paddd %xmm4, %xmm6
; SSE2-NEXT: paddd %xmm9, %xmm1
-; SSE2-NEXT: paddd %xmm3, %xmm0
-; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: paddd %xmm4, %xmm7
+; SSE2-NEXT: psrld $1, %xmm7
; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm8
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: packuswb %xmm7, %xmm1
; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm7
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: packuswb %xmm6, %xmm3
+; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm7
-; SSE2-NEXT: packuswb %xmm5, %xmm7
-; SSE2-NEXT: pand %xmm3, %xmm6
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: packuswb %xmm6, %xmm4
-; SSE2-NEXT: packuswb %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm3, %xmm8
-; SSE2-NEXT: packuswb %xmm2, %xmm8
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: psrld $1, %xmm8
+; SSE2-NEXT: psrld $1, %xmm0
; SSE2-NEXT: packuswb %xmm8, %xmm0
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: movdqu %xmm4, (%rax)
+; SSE2-NEXT: packuswb %xmm0, %xmm2
+; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8_const:
@@ -2277,9 +2268,9 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [5,6,7,8]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,2,3,4]
; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,2,3,4]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8]
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5
; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
@@ -2287,30 +2278,21 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm8
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm9, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm1, %xmm7, %xmm7
-; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm5
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1
-; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2
+; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm4, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm5, %xmm2
+; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2
+; AVX1-NEXT: vpsrld $1, %xmm9, %xmm3
+; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -2567,49 +2549,40 @@ define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6
; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5
; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm9
+; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm10
+; AVX2-NEXT: vpsrld $1, %ymm0, %ymm8
; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm3
-; AVX2-NEXT: vpsrld $1, %ymm9, %ymm8
+; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
+; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
; AVX2-NEXT: vpsrld $1, %ymm6, %ymm6
-; AVX2-NEXT: vpsrld $1, %ymm7, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm7
-; AVX2-NEXT: vpackssdw %xmm7, %xmm2, %xmm7
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm7
-; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm0
-; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; AVX2-NEXT: vpsrld $1, %ymm7, %ymm7
+; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm0
+; AVX2-NEXT: vpackssdw %xmm0, %xmm7, %xmm0
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX2-NEXT: vpackssdw %xmm7, %xmm6, %xmm6
+; AVX2-NEXT: vpackuswb %xmm0, %xmm6, %xmm0
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; AVX2-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm8, %xmm4
-; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm10, %xmm4
-; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm3
+; AVX2-NEXT: vpackssdw %xmm3, %xmm8, %xmm3
+; AVX2-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 923e1b9b0e0..dc386415934 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -12,7 +12,6 @@ define void @zero128() nounwind ssp {
; CHECK-NEXT: movq _z@{{.*}}(%rip), %rax
; CHECK-NEXT: vmovaps %xmm0, (%rax)
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
store <4 x float> zeroinitializer, <4 x float>* @z, align 16
ret void
}
@@ -27,7 +26,6 @@ define void @zero256() nounwind ssp {
; CHECK-NEXT: vmovaps %ymm0, (%rax)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
store <8 x float> zeroinitializer, <8 x float>* @x, align 32
store <4 x double> zeroinitializer, <4 x double>* @y, align 32
ret void
@@ -41,7 +39,6 @@ define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nou
; CHECK-NEXT: vmovaps %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
allocas:
%ptr2vec615 = bitcast [0 x float]* %RET to <8 x float>*
store <8 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float
@@ -59,7 +56,6 @@ define void @ones2([0 x i32]* nocapture %RET, [0 x i32]* nocapture %aFOO) nounwi
; CHECK-NEXT: vmovaps %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
allocas:
%ptr2vec615 = bitcast [0 x i32]* %RET to <8 x i32>*
store <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32>* %ptr2vec615, align 32
@@ -83,7 +79,6 @@ define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind {
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%ptrcast.i33.i = bitcast [0 x float]* %aFOO to i32*
%val.i34.i = load i32, i32* %ptrcast.i33.i, align 4
%ptroffset.i22.i992 = getelementptr [0 x float], [0 x float]* %aFOO, i64 0, i64 1
@@ -102,7 +97,6 @@ define <16 x float> @fneg(<16 x float> %a) nounwind {
; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0
; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
ret <16 x float> %1
}
@@ -114,7 +108,6 @@ define <16 x i16> @build_vec_16x16(i16 %a) nounwind readonly {
; CHECK-NEXT: movzwl %di, %eax
; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%res = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %a, i32 0
ret <16 x i16> %res
}
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index 44eb14160ee..e508e345de6 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -581,15 +581,10 @@ declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone
define <8 x float> @test_x86_avx_rcp_ps_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_rcp_ps_256:
-; AVX: # BB#0:
-; AVX-NEXT: vrcpps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x53,0xc0]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_rcp_ps_256:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vrcp14ps %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x4c,0xc0]
-; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_rcp_ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vrcpps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x53,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -619,15 +614,10 @@ declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readno
define <8 x float> @test_x86_avx_rsqrt_ps_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_rsqrt_ps_256:
-; AVX: # BB#0:
-; AVX-NEXT: vrsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x52,0xc0]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_rsqrt_ps_256:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vrsqrt14ps %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x4e,0xc0]
-; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_rsqrt_ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vrsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x52,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -635,10 +625,15 @@ declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_sqrt_pd_256:
-; CHECK: # BB#0:
-; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX: # BB#0:
+; AVX-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vsqrtpd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x51,0xc0]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -646,10 +641,15 @@ declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_sqrt_ps_256:
-; CHECK: # BB#0:
-; CHECK-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX: # BB#0:
+; AVX-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vsqrtps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x51,0xc0]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll
index 44d13db65c9..858a27b1d48 100644
--- a/test/CodeGen/X86/avx-schedule.ll
+++ b/test/CodeGen/X86/avx-schedule.ll
@@ -3982,8 +3982,8 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
;
; SKX-LABEL: test_rcpps:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
-; SKX-NEXT: vrcp14ps (%rdi), %ymm1 # sched: [11:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpps (%rdi), %ymm1 # sched: [11:1.00]
; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
@@ -4174,8 +4174,8 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
;
; SKX-LABEL: test_rsqrtps:
; SKX: # BB#0:
-; SKX-NEXT: vrsqrt14ps %ymm0, %ymm0 # sched: [4:1.00]
-; SKX-NEXT: vrsqrt14ps (%rdi), %ymm1 # sched: [11:1.00]
+; SKX-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [11:1.00]
; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index b75bd8cc3ee..909e8398680 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -699,11 +699,13 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
; AVX512BW-NEXT: jg LBB17_1
; AVX512BW-NEXT: ## BB#2:
; AVX512BW-NEXT: vpcmpltud %zmm2, %zmm1, %k0
-; AVX512BW-NEXT: jmp LBB17_3
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB17_1:
-; AVX512BW-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
-; AVX512BW-NEXT: LBB17_3:
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-regcall-Mask.ll b/test/CodeGen/X86/avx512-regcall-Mask.ll
index bb541f46567..fa6adec675f 100644
--- a/test/CodeGen/X86/avx512-regcall-Mask.ll
+++ b/test/CodeGen/X86/avx512-regcall-Mask.ll
@@ -209,12 +209,18 @@ define i64 @caller_argv64i1() #0 {
; LINUXOSX64-NEXT: pushq %rax
; LINUXOSX64-NEXT: .cfi_adjust_cfa_offset 8
; LINUXOSX64-NEXT: callq test_argv64i1
-; LINUXOSX64-NEXT: addq $24, %rsp
+; LINUXOSX64-NEXT: addq $16, %rsp
; LINUXOSX64-NEXT: .cfi_adjust_cfa_offset -16
+; LINUXOSX64-NEXT: addq $8, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 40
; LINUXOSX64-NEXT: popq %r12
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 32
; LINUXOSX64-NEXT: popq %r13
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 24
; LINUXOSX64-NEXT: popq %r14
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %r15
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%v0 = bitcast i64 4294967298 to <64 x i1>
@@ -287,6 +293,7 @@ define <64 x i1> @caller_retv64i1() #0 {
; LINUXOSX64-NEXT: kmovq %rax, %k0
; LINUXOSX64-NEXT: vpmovm2b %k0, %zmm0
; LINUXOSX64-NEXT: popq %rax
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%call = call x86_regcallcc <64 x i1> @test_retv64i1()
@@ -397,7 +404,9 @@ define x86_regcallcc i32 @test_argv32i1(<32 x i1> %x0, <32 x i1> %x1, <32 x i1>
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
; LINUXOSX64-NEXT: addq $128, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: vzeroupper
; LINUXOSX64-NEXT: retq
entry:
@@ -451,6 +460,7 @@ define i32 @caller_argv32i1() #0 {
; LINUXOSX64-NEXT: movl $1, %edx
; LINUXOSX64-NEXT: callq test_argv32i1
; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%v0 = bitcast i32 1 to <32 x i1>
@@ -513,6 +523,7 @@ define i32 @caller_retv32i1() #0 {
; LINUXOSX64-NEXT: callq test_retv32i1
; LINUXOSX64-NEXT: incl %eax
; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%call = call x86_regcallcc <32 x i1> @test_retv32i1()
@@ -626,7 +637,9 @@ define x86_regcallcc i16 @test_argv16i1(<16 x i1> %x0, <16 x i1> %x1, <16 x i1>
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
; LINUXOSX64-NEXT: addq $128, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%res = call i16 @test_argv16i1helper(<16 x i1> %x0, <16 x i1> %x1, <16 x i1> %x2)
ret i16 %res
@@ -678,6 +691,7 @@ define i16 @caller_argv16i1() #0 {
; LINUXOSX64-NEXT: movl $1, %edx
; LINUXOSX64-NEXT: callq test_argv16i1
; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%v0 = bitcast i16 1 to <16 x i1>
@@ -746,6 +760,7 @@ define i16 @caller_retv16i1() #0 {
; LINUXOSX64-NEXT: incl %eax
; LINUXOSX64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%call = call x86_regcallcc <16 x i1> @test_retv16i1()
@@ -859,7 +874,9 @@ define x86_regcallcc i8 @test_argv8i1(<8 x i1> %x0, <8 x i1> %x1, <8 x i1> %x2)
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
; LINUXOSX64-NEXT: addq $128, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%res = call i8 @test_argv8i1helper(<8 x i1> %x0, <8 x i1> %x1, <8 x i1> %x2)
ret i8 %res
@@ -911,6 +928,7 @@ define i8 @caller_argv8i1() #0 {
; LINUXOSX64-NEXT: movl $1, %edx
; LINUXOSX64-NEXT: callq test_argv8i1
; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%v0 = bitcast i8 1 to <8 x i1>
@@ -984,9 +1002,11 @@ define <8 x i1> @caller_retv8i1() #0 {
; LINUXOSX64-NEXT: vpmovm2w %k0, %zmm0
; LINUXOSX64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; LINUXOSX64-NEXT: popq %rax
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: vzeroupper
; LINUXOSX64-NEXT: retq
entry:
%call = call x86_regcallcc <8 x i1> @test_retv8i1()
ret <8 x i1> %call
}
+
diff --git a/test/CodeGen/X86/avx512-regcall-NoMask.ll b/test/CodeGen/X86/avx512-regcall-NoMask.ll
index 43a1871245b..b4f1d2c776d 100644
--- a/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -63,6 +63,7 @@ define x86_regcallcc i1 @test_CallargReti1(i1 %a) {
; LINUXOSX64-NEXT: callq test_argReti1
; LINUXOSX64-NEXT: incb %al
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = add i1 %a, 1
%c = call x86_regcallcc i1 @test_argReti1(i1 %b)
@@ -130,6 +131,7 @@ define x86_regcallcc i8 @test_CallargReti8(i8 %a) {
; LINUXOSX64-NEXT: callq test_argReti8
; LINUXOSX64-NEXT: incb %al
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = add i8 %a, 1
%c = call x86_regcallcc i8 @test_argReti8(i8 %b)
@@ -200,6 +202,7 @@ define x86_regcallcc i16 @test_CallargReti16(i16 %a) {
; LINUXOSX64-NEXT: incl %eax
; LINUXOSX64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = add i16 %a, 1
%c = call x86_regcallcc i16 @test_argReti16(i16 %b)
@@ -261,6 +264,7 @@ define x86_regcallcc i32 @test_CallargReti32(i32 %a) {
; LINUXOSX64-NEXT: callq test_argReti32
; LINUXOSX64-NEXT: incl %eax
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = add i32 %a, 1
%c = call x86_regcallcc i32 @test_argReti32(i32 %b)
@@ -327,6 +331,7 @@ define x86_regcallcc i64 @test_CallargReti64(i64 %a) {
; LINUXOSX64-NEXT: callq test_argReti64
; LINUXOSX64-NEXT: incq %rax
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = add i64 %a, 1
%c = call x86_regcallcc i64 @test_argReti64(i64 %b)
@@ -406,7 +411,9 @@ define x86_regcallcc float @test_CallargRetFloat(float %a) {
; LINUXOSX64-NEXT: vaddss %xmm8, %xmm0, %xmm0
; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
; LINUXOSX64-NEXT: addq $16, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = fadd float 1.0, %a
%c = call x86_regcallcc float @test_argRetFloat(float %b)
@@ -486,7 +493,9 @@ define x86_regcallcc double @test_CallargRetDouble(double %a) {
; LINUXOSX64-NEXT: vaddsd %xmm8, %xmm0, %xmm0
; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
; LINUXOSX64-NEXT: addq $16, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = fadd double 1.0, %a
%c = call x86_regcallcc double @test_argRetDouble(double %b)
@@ -548,6 +557,7 @@ define x86_regcallcc x86_fp80 @test_CallargRetf80(x86_fp80 %a) {
; LINUXOSX64-NEXT: callq test_argRetf80
; LINUXOSX64-NEXT: fadd %st(0), %st(0)
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = fadd x86_fp80 %a, %a
%c = call x86_regcallcc x86_fp80 @test_argRetf80(x86_fp80 %b)
@@ -611,6 +621,7 @@ define x86_regcallcc [4 x i32]* @test_CallargRetPointer([4 x i32]* %a) {
; LINUXOSX64-NEXT: callq test_argRetPointer
; LINUXOSX64-NEXT: incl %eax
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = ptrtoint [4 x i32]* %a to i32
%c = add i32 %b, 1
@@ -694,7 +705,9 @@ define x86_regcallcc <4 x i32> @test_CallargRet128Vector(<4 x i32> %a) {
; LINUXOSX64-NEXT: vmovdqa32 %xmm8, %xmm0 {%k1}
; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
; LINUXOSX64-NEXT: addq $16, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = call x86_regcallcc <4 x i32> @test_argRet128Vector(<4 x i32> %a, <4 x i32> %a)
%c = select <4 x i1> undef , <4 x i32> %a, <4 x i32> %b
@@ -768,7 +781,9 @@ define x86_regcallcc <8 x i32> @test_CallargRet256Vector(<8 x i32> %a) {
; LINUXOSX64-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload
; LINUXOSX64-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
; LINUXOSX64-NEXT: addq $48, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = call x86_regcallcc <8 x i32> @test_argRet256Vector(<8 x i32> %a, <8 x i32> %a)
%c = select <8 x i1> undef , <8 x i32> %a, <8 x i32> %b
@@ -842,7 +857,9 @@ define x86_regcallcc <16 x i32> @test_CallargRet512Vector(<16 x i32> %a) {
; LINUXOSX64-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
; LINUXOSX64-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; LINUXOSX64-NEXT: addq $112, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = call x86_regcallcc <16 x i32> @test_argRet512Vector(<16 x i32> %a, <16 x i32> %a)
%c = select <16 x i1> undef , <16 x i32> %a, <16 x i32> %b
diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll
index 8372fbdb9ab..abc8c1a7513 100755
--- a/test/CodeGen/X86/avx512-schedule.ll
+++ b/test/CodeGen/X86/avx512-schedule.ll
@@ -8839,6 +8839,7 @@ define <16 x float> @broadcast_ss_spill(float %x) {
; GENERIC-NEXT: callq func_f32
; GENERIC-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload
; GENERIC-NEXT: addq $24, %rsp # sched: [1:0.33]
+; GENERIC-NEXT: .cfi_def_cfa_offset 8
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: broadcast_ss_spill:
@@ -8852,6 +8853,7 @@ define <16 x float> @broadcast_ss_spill(float %x) {
; SKX-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:0.50]
; SKX-NEXT: # sched: [8:0.50]
; SKX-NEXT: addq $24, %rsp # sched: [1:0.25]
+; SKX-NEXT: .cfi_def_cfa_offset 8
; SKX-NEXT: retq # sched: [7:1.00]
%a = fadd float %x, %x
call void @func_f32(float %a)
@@ -8872,6 +8874,7 @@ define <8 x double> @broadcast_sd_spill(double %x) {
; GENERIC-NEXT: callq func_f64
; GENERIC-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload
; GENERIC-NEXT: addq $24, %rsp # sched: [1:0.33]
+; GENERIC-NEXT: .cfi_def_cfa_offset 8
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: broadcast_sd_spill:
@@ -8885,6 +8888,7 @@ define <8 x double> @broadcast_sd_spill(double %x) {
; SKX-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:0.50]
; SKX-NEXT: # sched: [8:0.50]
; SKX-NEXT: addq $24, %rsp # sched: [1:0.25]
+; SKX-NEXT: .cfi_def_cfa_offset 8
; SKX-NEXT: retq # sched: [7:1.00]
%a = fadd double %x, %x
call void @func_f64(double %a)
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index 43cf9ee7358..51a7c685ed4 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -115,6 +115,7 @@ define <16 x double> @select04(<16 x double> %a, <16 x double> %b) {
; X86-NEXT: vmovaps 8(%ebp), %zmm1
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: retl
;
; X64-LABEL: select04:
diff --git a/test/CodeGen/X86/avx512-shuffle-schedule.ll b/test/CodeGen/X86/avx512-shuffle-schedule.ll
index c59fb5b97bc..c95f0d40fbf 100755
--- a/test/CodeGen/X86/avx512-shuffle-schedule.ll
+++ b/test/CodeGen/X86/avx512-shuffle-schedule.ll
@@ -9533,18 +9533,18 @@ define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %ve
define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9555,18 +9555,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x flo
define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9576,18 +9574,18 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8
define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9598,18 +9596,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x flo
define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9619,18 +9615,18 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8
define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9641,18 +9637,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x flo
define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9675,18 +9669,18 @@ define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %ve
define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9697,18 +9691,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x flo
define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9732,18 +9724,18 @@ define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>*
define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9755,18 +9747,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9778,18 +9768,18 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9801,18 +9791,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9824,18 +9812,18 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9847,18 +9835,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9884,18 +9870,18 @@ define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>*
define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9907,18 +9893,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -10337,18 +10321,18 @@ define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double>
define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10359,18 +10343,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10380,18 +10362,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10402,18 +10384,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10423,18 +10403,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10445,18 +10425,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10479,18 +10457,18 @@ define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double>
define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10501,18 +10479,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10536,18 +10512,18 @@ define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x doub
define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10559,18 +10535,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10582,18 +10556,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec
define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10605,18 +10579,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10628,18 +10600,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec
define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10651,18 +10623,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10688,18 +10658,18 @@ define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x doub
define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10711,18 +10681,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -11128,12 +11096,12 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec
define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
; GENERIC-LABEL: test_8xi32_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
ret <8 x i32> %res
@@ -11141,18 +11109,18 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11163,18 +11131,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11184,18 +11150,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11206,18 +11172,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11227,18 +11191,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11249,18 +11213,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11270,12 +11232,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
; GENERIC-LABEL: test_8xi32_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
ret <8 x i32> %res
@@ -11283,18 +11245,18 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11305,18 +11267,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11326,12 +11286,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; GENERIC-LABEL: test_8xi32_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11340,18 +11300,18 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11363,18 +11323,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11386,18 +11344,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11409,18 +11367,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11432,18 +11388,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11455,18 +11411,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11478,12 +11432,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; GENERIC-LABEL: test_8xi32_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11492,18 +11446,18 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11515,18 +11469,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11932,12 +11884,12 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16
define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
; GENERIC-LABEL: test_4xi64_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
ret <4 x i64> %res
@@ -11945,18 +11897,18 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -11967,18 +11919,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -11988,18 +11938,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12010,18 +11960,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12031,18 +11979,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12053,18 +12001,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12074,12 +12020,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
; GENERIC-LABEL: test_4xi64_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x i64> %res
@@ -12087,18 +12033,18 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12109,18 +12055,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12130,12 +12074,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; GENERIC-LABEL: test_4xi64_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12144,18 +12088,18 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12167,18 +12111,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12190,18 +12132,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12213,18 +12155,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12236,18 +12176,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12259,18 +12199,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12282,12 +12220,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; GENERIC-LABEL: test_4xi64_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12296,18 +12234,18 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12319,18 +12257,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
diff --git a/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll b/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
index c957a85a885..799bbc11bee 100644
--- a/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
+++ b/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
@@ -14,10 +14,10 @@ define <8 x float> @test_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec
define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -28,10 +28,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x floa
define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -41,10 +40,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -55,10 +54,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x floa
define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -68,10 +66,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -82,10 +80,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x floa
define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -103,10 +100,10 @@ define <8 x float> @test_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec
define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -117,10 +114,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x floa
define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -139,10 +135,10 @@ define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>*
define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -154,10 +150,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -169,10 +164,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -184,10 +179,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -199,10 +193,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -214,10 +208,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -238,10 +231,10 @@ define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>*
define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -253,10 +246,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -530,10 +522,10 @@ define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double>
define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -544,10 +536,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -557,10 +548,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -571,10 +562,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -584,10 +574,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -598,10 +588,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -619,10 +608,10 @@ define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double>
define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -633,10 +622,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -655,10 +643,10 @@ define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x doub
define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -670,10 +658,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -685,10 +672,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec
define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -700,10 +687,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -715,10 +701,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec
define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -730,10 +716,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -754,10 +739,10 @@ define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x doub
define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -769,10 +754,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1038,7 +1022,7 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec
define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
; CHECK-LABEL: test_8xi32_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
ret <8 x i32> %res
@@ -1046,10 +1030,10 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1060,10 +1044,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1073,10 +1056,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1087,10 +1070,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1100,10 +1082,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1114,10 +1096,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1127,7 +1108,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
; CHECK-LABEL: test_8xi32_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
ret <8 x i32> %res
@@ -1135,10 +1116,10 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1149,10 +1130,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1162,7 +1142,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; CHECK-LABEL: test_8xi32_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -1171,10 +1151,10 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -1186,10 +1166,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -1201,10 +1180,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1216,10 +1195,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1231,10 +1209,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1246,10 +1224,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1261,7 +1238,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; CHECK-LABEL: test_8xi32_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1270,10 +1247,10 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1285,10 +1262,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1554,7 +1530,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16
define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
; CHECK-LABEL: test_4xi64_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
ret <4 x i64> %res
@@ -1562,10 +1538,10 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1576,10 +1552,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1589,10 +1564,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1603,10 +1578,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1616,10 +1590,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1630,10 +1604,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1643,7 +1616,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
; CHECK-LABEL: test_4xi64_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x i64> %res
@@ -1651,10 +1624,10 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1665,10 +1638,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1678,7 +1650,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; CHECK-LABEL: test_4xi64_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1687,10 +1659,10 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1702,10 +1674,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1717,10 +1688,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1732,10 +1703,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1747,10 +1717,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1762,10 +1732,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1777,7 +1746,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; CHECK-LABEL: test_4xi64_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1786,10 +1755,10 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1801,10 +1770,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index 23d66457994..ff25c005e9c 100644
--- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -46,8 +46,6 @@ define <8 x i1> @test3(<4 x i1> %a) {
; CHECK: # BB#0:
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
-; CHECK-NEXT: kshiftlb $4, %k0, %k0
-; CHECK-NEXT: kshiftrb $4, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
index 584968f1c6e..9aacb23fbd5 100644
--- a/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -413,6 +413,7 @@ define <16 x float> @broadcast_ss_spill(float %x) {
; ALL-NEXT: callq func_f32
; ALL-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload
; ALL-NEXT: addq $24, %rsp
+; ALL-NEXT: .cfi_def_cfa_offset 8
; ALL-NEXT: retq
%a = fadd float %x, %x
call void @func_f32(float %a)
@@ -432,6 +433,7 @@ define <8 x double> @broadcast_sd_spill(double %x) {
; ALL-NEXT: callq func_f64
; ALL-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload
; ALL-NEXT: addq $24, %rsp
+; ALL-NEXT: .cfi_def_cfa_offset 8
; ALL-NEXT: retq
%a = fadd double %x, %x
call void @func_f64(double %a)
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
index d1bf8fd5f3f..7f170cd51bf 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
@@ -717,6 +717,7 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: vpbroadcastb %eax, %zmm3 {%k1}
; X32-NEXT: vmovdqa64 %zmm3, %zmm0
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_set1_epi8:
@@ -1444,6 +1445,7 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: korq %k0, %k1, %k1
; X32-NEXT: vpbroadcastb %eax, %zmm0 {%k1} {z}
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_set1_epi8:
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index a5ef1809157..87565ac129b 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -355,6 +355,7 @@ define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
@@ -380,6 +381,7 @@ define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
@@ -445,6 +447,7 @@ define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
@@ -470,6 +473,7 @@ define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
@@ -1702,6 +1706,7 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
@@ -2503,8 +2508,11 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: addl %esi, %eax
; AVX512F-32-NEXT: adcl %ecx, %edx
; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 12
; AVX512F-32-NEXT: popl %esi
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
; AVX512F-32-NEXT: popl %ebx
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
@@ -2586,6 +2594,7 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
@@ -3387,8 +3396,11 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: addl %esi, %eax
; AVX512F-32-NEXT: adcl %ecx, %edx
; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 12
; AVX512F-32-NEXT: popl %esi
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
; AVX512F-32-NEXT: popl %ebx
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index e23deebd15b..c2620642e5c 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1499,6 +1499,7 @@ define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
ret i64 %res
@@ -1522,6 +1523,7 @@ define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8> %x0)
ret i64 %res
@@ -1712,6 +1714,7 @@ define i64@test_int_x86_avx512_ptestm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $20, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
%res1 = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1)
@@ -1776,6 +1779,7 @@ define i64@test_int_x86_avx512_ptestnm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $20, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
%res1 = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1)
diff --git a/test/CodeGen/X86/avx512bw-vec-test-testn.ll b/test/CodeGen/X86/avx512bw-vec-test-testn.ll
index 6dd6440faa1..82d0b8846de 100644
--- a/test/CodeGen/X86/avx512bw-vec-test-testn.ll
+++ b/test/CodeGen/X86/avx512bw-vec-test-testn.ll
@@ -5,9 +5,7 @@
define zeroext i32 @TEST_mm512_test_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqw %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestmw %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -24,9 +22,7 @@ entry:
define zeroext i64 @TEST_mm512_test_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqb %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestmb %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovq %k0, %rax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -42,10 +38,8 @@ entry:
define zeroext i32 @TEST_mm512_mask_test_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmw %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -63,10 +57,8 @@ entry:
define zeroext i64 @TEST_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovq %rdi, %k1
-; CHECK-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmb %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovq %k0, %rax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -84,9 +76,7 @@ entry:
define zeroext i32 @TEST_mm512_testn_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestnmw %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -103,9 +93,7 @@ entry:
define zeroext i64 @TEST_mm512_testn_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestnmb %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovq %k0, %rax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -121,10 +109,8 @@ entry:
define zeroext i32 @TEST_mm512_mask_testn_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmw %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -142,10 +128,8 @@ entry:
define zeroext i64 @TEST_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovq %rdi, %k1
-; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmb %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovq %k0, %rax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll b/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
index f67ceb2fe04..44075deb1d9 100644
--- a/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
+++ b/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
@@ -5,9 +5,7 @@
define zeroext i16 @TEST_mm_test_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k0
+; CHECK-NEXT: vptestmb %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -23,10 +21,8 @@ entry:
define zeroext i16 @TEST_mm_mask_test_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_mask_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmb %xmm0, %xmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -44,9 +40,7 @@ entry:
define zeroext i8 @TEST_mm_test_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k0
+; CHECK-NEXT: vptestmw %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -62,10 +56,8 @@ entry:
define zeroext i8 @TEST_mm_mask_test_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_mask_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmw %xmm0, %xmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -83,9 +75,7 @@ entry:
define zeroext i16 @TEST_mm_testn_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
+; CHECK-NEXT: vptestnmb %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -101,10 +91,8 @@ entry:
define zeroext i16 @TEST_mm_mask_testn_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_mask_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmb %xmm0, %xmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -122,9 +110,7 @@ entry:
define zeroext i8 @TEST_mm_testn_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
+; CHECK-NEXT: vptestnmw %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -140,10 +126,8 @@ entry:
define zeroext i8 @TEST_mm_mask_testn_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_mask_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmw %xmm0, %xmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -161,9 +145,7 @@ entry:
define i32 @TEST_mm256_test_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0
+; CHECK-NEXT: vptestmb %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -179,10 +161,8 @@ entry:
define i32 @TEST_mm256_mask_test_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_mask_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: vptestmb %ymm0, %ymm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -200,9 +180,7 @@ entry:
define zeroext i16 @TEST_mm256_test_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k0
+; CHECK-NEXT: vptestmw %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -219,10 +197,8 @@ entry:
define zeroext i16 @TEST_mm256_mask_test_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_mask_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: vptestmw %ymm0, %ymm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -241,9 +217,7 @@ entry:
define i32 @TEST_mm256_testn_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
+; CHECK-NEXT: vptestnmb %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -259,10 +233,8 @@ entry:
define i32 @TEST_mm256_mask_testn_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_mask_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmb %ymm0, %ymm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -280,9 +252,7 @@ entry:
define zeroext i16 @TEST_mm256_testn_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
+; CHECK-NEXT: vptestnmw %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -299,10 +269,8 @@ entry:
define zeroext i16 @TEST_mm256_mask_testn_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_mask_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmw %ymm0, %ymm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll
new file mode 100644
index 00000000000..ca5e5523a9d
--- /dev/null
+++ b/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512cd | FileCheck %s
+
+define <8 x i64> @test_mm512_broadcastmb_epi64(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_mm512_broadcastmb_epi64:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = icmp eq <8 x i64> %a, %b
+ %1 = bitcast <8 x i1> %0 to i8
+ %conv.i = zext i8 %1 to i64
+ %vecinit.i.i = insertelement <8 x i64> undef, i64 %conv.i, i32 0
+ %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ ret <8 x i64> %vecinit7.i.i
+}
+
+define <8 x i64> @test_mm512_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_mm512_broadcastmw_epi32:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %a to <16 x i32>
+ %1 = bitcast <8 x i64> %b to <16 x i32>
+ %2 = icmp eq <16 x i32> %0, %1
+ %3 = bitcast <16 x i1> %2 to i16
+ %conv.i = zext i16 %3 to i32
+ %vecinit.i.i = insertelement <16 x i32> undef, i32 %conv.i, i32 0
+ %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %4 = bitcast <16 x i32> %vecinit15.i.i to <8 x i64>
+ ret <8 x i64> %4
+}
+
+
diff --git a/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll
index e5dbff9ac51..92dfe1e087a 100644
--- a/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll
@@ -45,3 +45,26 @@ define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
%res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
ret <8 x i64> %res
}
+
+define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) {
+; CHECK-LABEL: test_x86_vbroadcastmw_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: vpbroadcastd %eax, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0)
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16)
+
+define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) {
+; CHECK-LABEL: test_x86_broadcastmb_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: vpbroadcastq %rax, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0)
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8)
+
diff --git a/test/CodeGen/X86/avx512cd-intrinsics.ll b/test/CodeGen/X86/avx512cd-intrinsics.ll
index 7e5a3e8fe25..ab8c80f8dd3 100644
--- a/test/CodeGen/X86/avx512cd-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cd-intrinsics.ll
@@ -1,28 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s
-define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) {
-; CHECK-LABEL: test_x86_vbroadcastmw_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmw2d %k0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0)
- ret <16 x i32> %res
-}
-declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16)
-
-define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) {
-; CHECK-LABEL: test_x86_broadcastmb_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmb2q %k0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0)
- ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8)
-
declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
define <8 x i64> @test_conflict_q(<8 x i64> %a) {
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
index f8f47c87100..0e310be3489 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
@@ -69,3 +69,47 @@ define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64>
ret <4 x i64> %res2
}
+define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) {
+; CHECK-LABEL: test_x86_vbroadcastmw_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: vpbroadcastd %eax, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ;
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16)
+
+define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) {
+; CHECK-LABEL: test_x86_vbroadcastmw_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: vpbroadcastd %eax, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16)
+
+define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) {
+; CHECK-LABEL: test_x86_broadcastmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: vpbroadcastq %rax, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ;
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8)
+
+define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) {
+; CHECK-LABEL: test_x86_broadcastmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: vpbroadcastq %rax, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8)
+
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
index 96254f7c95b..2fb50297c62 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
@@ -147,46 +147,3 @@ define <4 x i64>@test_int_x86_avx512_mask_vpconflict_q_256(<4 x i64> %x0, <4 x i
ret <4 x i64> %res2
}
-define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) {
-; CHECK-LABEL: test_x86_vbroadcastmw_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmw2d %k0, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ;
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16)
-
-define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) {
-; CHECK-LABEL: test_x86_vbroadcastmw_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmw2d %k0, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16)
-
-define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) {
-; CHECK-LABEL: test_x86_broadcastmb_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmb2q %k0, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ;
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8)
-
-define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) {
-; CHECK-LABEL: test_x86_broadcastmb_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmb2q %k0, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8)
diff --git a/test/CodeGen/X86/avx512f-vec-test-testn.ll b/test/CodeGen/X86/avx512f-vec-test-testn.ll
index c9c0c2251a4..e9cdacc354f 100644
--- a/test/CodeGen/X86/avx512f-vec-test-testn.ll
+++ b/test/CodeGen/X86/avx512f-vec-test-testn.ll
@@ -5,9 +5,7 @@
define zeroext i8 @TEST_mm512_test_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_test_epi64_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -23,9 +21,7 @@ entry:
define zeroext i16 @TEST_mm512_test_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_test_epi32_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestmd %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -42,10 +38,8 @@ entry:
define zeroext i8 @TEST_mm512_mask_test_epi64_mask(i8 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_test_epi64_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -63,10 +57,8 @@ entry:
define zeroext i16 @TEST_mm512_mask_test_epi32_mask(i16 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_test_epi32_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -85,9 +77,7 @@ entry:
define zeroext i8 @TEST_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_testn_epi64_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -103,9 +93,7 @@ entry:
define zeroext i16 @TEST_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_testn_epi32_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -122,10 +110,8 @@ entry:
define zeroext i8 @TEST_mm512_mask_testn_epi64_mask(i8 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_testn_epi64_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -143,10 +129,8 @@ entry:
define zeroext i16 @TEST_mm512_mask_testn_epi32_mask(i16 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_testn_epi32_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
index f5578d6cc88..3f4a696af0c 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -233,6 +233,7 @@ define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64>
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastd_epi32:
@@ -265,6 +266,7 @@ define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastd_epi32:
@@ -369,6 +371,7 @@ define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64>
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastq_epi64:
@@ -398,6 +401,7 @@ define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastq_epi64:
@@ -441,6 +445,7 @@ define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastq_epi64:
@@ -470,6 +475,7 @@ define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
@@ -513,6 +519,7 @@ define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastsd_pd:
@@ -542,6 +549,7 @@ define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastsd_pd:
@@ -585,6 +593,7 @@ define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastsd_pd:
@@ -614,6 +623,7 @@ define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
@@ -657,6 +667,7 @@ define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x fl
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastss_ps:
@@ -686,6 +697,7 @@ define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastss_ps:
@@ -781,6 +793,7 @@ define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x doub
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_movddup_pd:
@@ -810,6 +823,7 @@ define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movddup_pd:
@@ -853,6 +867,7 @@ define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x d
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_movddup_pd:
@@ -882,6 +897,7 @@ define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_movddup_pd:
@@ -925,6 +941,7 @@ define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_movehdup_ps:
@@ -954,6 +971,7 @@ define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movehdup_ps:
@@ -1049,6 +1067,7 @@ define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_moveldup_ps:
@@ -1078,6 +1097,7 @@ define <4 x float> @test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_moveldup_ps:
@@ -1173,6 +1193,7 @@ define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_epi64:
@@ -1202,6 +1223,7 @@ define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_epi64:
@@ -1245,6 +1267,7 @@ define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_pd:
@@ -1274,6 +1297,7 @@ define <4 x double> @test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_pd:
@@ -1317,6 +1341,7 @@ define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x doub
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_pd:
@@ -1346,6 +1371,7 @@ define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x dou
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_pd:
@@ -1389,6 +1415,7 @@ define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x d
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_pd:
@@ -1418,6 +1445,7 @@ define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_pd:
@@ -1461,6 +1489,7 @@ define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float>
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_ps:
@@ -1490,6 +1519,7 @@ define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_ps:
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index b6723ee50b0..6c6fad794c8 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -4712,8 +4712,8 @@ declare <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32>, <8 x i32>, i32,
define <8 x i32>@test_int_x86_avx512_mask_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x03,0xd9,0x06]
-; CHECK-NEXT: ## ymm3 = ymm1[6,7],ymm0[0,1,2,3,4,5]
+; CHECK-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03]
+; CHECK-NEXT: ## ymm3 = ymm1[3],ymm0[0,1,2]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x03,0xd1,0x06]
; CHECK-NEXT: ## ymm2 {%k1} = ymm1[6,7],ymm0[0,1,2,3,4,5]
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 9098ca30897..35fecf8955c 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -2729,8 +2729,8 @@ define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x
; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xd1,0x16]
; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc1,0x16]
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vperm2f128 $48, %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x30]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0]
; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2752,7 +2752,7 @@ define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4
; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xd1,0x16]
; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc1,0x16]
+; CHECK-NEXT: vperm2f128 $48, %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x30]
; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0]
; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
@@ -2773,8 +2773,8 @@ define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x43,0xd1,0x16]
; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x43,0xc1,0x16]
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vperm2i128 $48, %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x46,0xc1,0x30]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
@@ -2791,7 +2791,7 @@ define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xd1,0x16]
; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x43,0xc1,0x16]
+; CHECK-NEXT: vperm2i128 $48, %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x46,0xc1,0x30]
; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
diff --git a/test/CodeGen/X86/avx512vl-vbroadcast.ll b/test/CodeGen/X86/avx512vl-vbroadcast.ll
index 9fc957297e2..1098e7bffe0 100644
--- a/test/CodeGen/X86/avx512vl-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512vl-vbroadcast.ll
@@ -12,6 +12,7 @@ define <8 x float> @_256_broadcast_ss_spill(float %x) {
; CHECK-NEXT: callq func_f32
; CHECK-NEXT: vbroadcastss (%rsp), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%a = fadd float %x, %x
call void @func_f32(float %a)
@@ -30,6 +31,7 @@ define <4 x float> @_128_broadcast_ss_spill(float %x) {
; CHECK-NEXT: callq func_f32
; CHECK-NEXT: vbroadcastss (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%a = fadd float %x, %x
call void @func_f32(float %a)
@@ -49,6 +51,7 @@ define <4 x double> @_256_broadcast_sd_spill(double %x) {
; CHECK-NEXT: callq func_f64
; CHECK-NEXT: vbroadcastsd (%rsp), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%a = fadd double %x, %x
call void @func_f64(double %a)
diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index 5ee06fde127..bccf953fb0b 100644
--- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -109,6 +109,7 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -227,6 +228,7 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -348,6 +350,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -470,6 +473,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -597,6 +601,7 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -720,6 +725,7 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -846,6 +852,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -973,6 +980,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1024,6 +1032,7 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1071,6 +1080,7 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1129,6 +1139,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1188,6 +1199,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1217,8 +1229,6 @@ define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -1246,8 +1256,6 @@ define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -1278,8 +1286,6 @@ define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -1311,8 +1317,6 @@ define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -1392,6 +1396,7 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1465,6 +1470,7 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1541,6 +1547,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1618,6 +1625,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1700,6 +1708,7 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1778,6 +1787,7 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1859,6 +1869,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1941,6 +1952,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2064,6 +2076,7 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2183,6 +2196,7 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2305,6 +2319,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2428,6 +2443,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2556,6 +2572,7 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2680,6 +2697,7 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2807,6 +2825,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2935,6 +2954,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -3288,6 +3308,7 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -3552,6 +3573,7 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -3912,6 +3934,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -4188,6 +4211,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5051,6 +5075,7 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5092,6 +5117,7 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5153,6 +5179,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5216,6 +5243,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5263,6 +5291,7 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5326,6 +5355,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5379,6 +5409,7 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5426,6 +5457,7 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5493,6 +5525,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5562,6 +5595,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5615,6 +5649,7 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5684,6 +5719,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5957,6 +5993,7 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6030,6 +6067,7 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6106,6 +6144,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6183,6 +6222,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6260,6 +6300,7 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6337,6 +6378,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6420,6 +6462,7 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6498,6 +6541,7 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6579,6 +6623,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6661,6 +6706,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6743,6 +6789,7 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6825,6 +6872,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6946,6 +6994,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7062,6 +7111,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7181,6 +7231,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7301,6 +7352,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7421,6 +7473,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7541,6 +7594,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7667,6 +7721,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7788,6 +7843,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7912,6 +7968,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -8037,6 +8094,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -8162,6 +8220,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -8287,6 +8346,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9131,6 +9191,7 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9172,6 +9233,7 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9225,6 +9287,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9280,6 +9343,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9327,6 +9391,7 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9382,6 +9447,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9435,6 +9501,7 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9482,6 +9549,7 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9541,6 +9609,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9602,6 +9671,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9655,6 +9725,7 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9716,6 +9787,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10607,6 +10679,7 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10650,6 +10723,7 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10713,6 +10787,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10778,6 +10853,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10827,6 +10903,7 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10892,6 +10969,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10947,6 +11025,7 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10996,6 +11075,7 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11065,6 +11145,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11136,6 +11217,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11191,6 +11273,7 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11262,6 +11345,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11509,6 +11593,7 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11580,6 +11665,7 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11654,6 +11740,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11729,6 +11816,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11804,6 +11892,7 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11879,6 +11968,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11960,6 +12050,7 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12036,6 +12127,7 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12115,6 +12207,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12195,6 +12288,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12275,6 +12369,7 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12355,6 +12450,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12478,6 +12574,7 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12596,6 +12693,7 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12717,6 +12815,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12839,6 +12938,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12966,6 +13066,7 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13089,6 +13190,7 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13215,6 +13317,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13342,6 +13445,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13393,6 +13497,7 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13440,6 +13545,7 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13498,6 +13604,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13557,6 +13664,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13586,8 +13694,6 @@ define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -13615,8 +13721,6 @@ define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -13647,8 +13751,6 @@ define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -13680,8 +13782,6 @@ define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -13761,6 +13861,7 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13834,6 +13935,7 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13910,6 +14012,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13987,6 +14090,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14069,6 +14173,7 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14147,6 +14252,7 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14228,6 +14334,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14310,6 +14417,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14433,6 +14541,7 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14552,6 +14661,7 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14674,6 +14784,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14797,6 +14908,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14925,6 +15037,7 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -15049,6 +15162,7 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -15176,6 +15290,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -15304,6 +15419,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -15657,6 +15773,7 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -15921,6 +16038,7 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -16281,6 +16399,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -16557,6 +16676,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17420,6 +17540,7 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17461,6 +17582,7 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17522,6 +17644,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17585,6 +17708,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17632,6 +17756,7 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17695,6 +17820,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17748,6 +17874,7 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17795,6 +17922,7 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17862,6 +17990,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17931,6 +18060,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17984,6 +18114,7 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18053,6 +18184,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18326,6 +18458,7 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18399,6 +18532,7 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18475,6 +18609,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18552,6 +18687,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18629,6 +18765,7 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18706,6 +18843,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18789,6 +18927,7 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18867,6 +19006,7 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18948,6 +19088,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19030,6 +19171,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19112,6 +19254,7 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19194,6 +19337,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19315,6 +19459,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19431,6 +19576,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19550,6 +19696,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19670,6 +19817,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19790,6 +19938,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19910,6 +20059,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20036,6 +20186,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20157,6 +20308,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20281,6 +20433,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20406,6 +20559,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20531,6 +20685,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20656,6 +20811,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21500,6 +21656,7 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21541,6 +21698,7 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21594,6 +21752,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21649,6 +21808,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21696,6 +21856,7 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21751,6 +21912,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21804,6 +21966,7 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21851,6 +22014,7 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21910,6 +22074,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21971,6 +22136,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -22024,6 +22190,7 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -22085,6 +22252,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -22976,6 +23144,7 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23019,6 +23188,7 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23082,6 +23252,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23147,6 +23318,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23196,6 +23368,7 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23261,6 +23434,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23316,6 +23490,7 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23365,6 +23540,7 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23434,6 +23610,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23505,6 +23682,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23560,6 +23738,7 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23631,6 +23810,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23878,6 +24058,7 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23949,6 +24130,7 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24023,6 +24205,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24098,6 +24281,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24173,6 +24357,7 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24248,6 +24433,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24329,6 +24515,7 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24405,6 +24592,7 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24484,6 +24672,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24564,6 +24753,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24644,6 +24834,7 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24724,6 +24915,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24849,6 +25041,7 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24970,6 +25163,7 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25093,6 +25287,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25218,6 +25413,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25347,6 +25543,7 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25473,6 +25670,7 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25601,6 +25799,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25731,6 +25930,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25784,6 +25984,7 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25834,6 +26035,7 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25894,6 +26096,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25956,6 +26159,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25987,8 +26191,6 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -26019,8 +26221,6 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -26053,8 +26253,6 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -26089,8 +26287,6 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -26172,6 +26368,7 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26248,6 +26445,7 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26326,6 +26524,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26406,6 +26605,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26490,6 +26690,7 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26571,6 +26772,7 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26654,6 +26856,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26739,6 +26942,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26864,6 +27068,7 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26986,6 +27191,7 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27110,6 +27316,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27236,6 +27443,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27366,6 +27574,7 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27493,6 +27702,7 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27622,6 +27832,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27753,6 +27964,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -28109,6 +28321,7 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -28378,6 +28591,7 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -28741,6 +28955,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -29022,6 +29237,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -29903,6 +30119,7 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -29947,6 +30164,7 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30008,6 +30226,7 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30072,6 +30291,7 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30121,6 +30341,7 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30184,6 +30405,7 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30239,6 +30461,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30289,6 +30512,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30356,6 +30580,7 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30426,6 +30651,7 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30481,6 +30707,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30550,6 +30777,7 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30823,6 +31051,7 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30896,6 +31125,7 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30972,6 +31202,7 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31049,6 +31280,7 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31126,6 +31358,7 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31203,6 +31436,7 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31286,6 +31520,7 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31364,6 +31599,7 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31445,6 +31681,7 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31527,6 +31764,7 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31609,6 +31847,7 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31691,6 +31930,7 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31812,6 +32052,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31928,6 +32169,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32047,6 +32289,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32167,6 +32410,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32287,6 +32531,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32407,6 +32652,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32533,6 +32779,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32654,6 +32901,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32778,6 +33026,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32903,6 +33152,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -33028,6 +33278,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -33153,6 +33404,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34023,6 +34275,7 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34067,6 +34320,7 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34120,6 +34374,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34176,6 +34431,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34225,6 +34481,7 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34280,6 +34537,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34335,6 +34593,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34385,6 +34644,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34444,6 +34704,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34506,6 +34767,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34561,6 +34823,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34622,6 +34885,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35543,6 +35807,7 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35589,6 +35854,7 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35654,6 +35920,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35722,6 +35989,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35773,6 +36041,7 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35840,6 +36109,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35897,6 +36167,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35949,6 +36220,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36020,6 +36292,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36094,6 +36367,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36151,6 +36425,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36224,6 +36499,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36471,6 +36747,7 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36542,6 +36819,7 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36616,6 +36894,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36691,6 +36970,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36766,6 +37046,7 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36841,6 +37122,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36922,6 +37204,7 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36998,6 +37281,7 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37077,6 +37361,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37157,6 +37442,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37237,6 +37523,7 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37317,6 +37604,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37443,6 +37731,7 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37564,6 +37853,7 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37688,6 +37978,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37813,6 +38104,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37943,6 +38235,7 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38069,6 +38362,7 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38198,6 +38492,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38328,6 +38623,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38382,6 +38678,7 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38432,6 +38729,7 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38493,6 +38791,7 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38555,6 +38854,7 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38587,8 +38887,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -38619,8 +38917,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -38654,8 +38950,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -38690,8 +38984,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -38774,6 +39066,7 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38850,6 +39143,7 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38929,6 +39223,7 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39009,6 +39304,7 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39094,6 +39390,7 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39175,6 +39472,7 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39259,6 +39557,7 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39344,6 +39643,7 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39470,6 +39770,7 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39592,6 +39893,7 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39717,6 +40019,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39843,6 +40146,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39974,6 +40278,7 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -40101,6 +40406,7 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -40231,6 +40537,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -40362,6 +40669,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -40720,6 +41028,7 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -40989,6 +41298,7 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -41354,6 +41664,7 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -41635,6 +41946,7 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42537,6 +42849,7 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42581,6 +42894,7 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42645,6 +42959,7 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42711,6 +43026,7 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42761,6 +43077,7 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42827,6 +43144,7 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42883,6 +43201,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42933,6 +43252,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43003,6 +43323,7 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43075,6 +43396,7 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43131,6 +43453,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43203,6 +43526,7 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43476,6 +43800,7 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43549,6 +43874,7 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43625,6 +43951,7 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43702,6 +44029,7 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43779,6 +44107,7 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43856,6 +44185,7 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43939,6 +44269,7 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44017,6 +44348,7 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44098,6 +44430,7 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44180,6 +44513,7 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44262,6 +44596,7 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44344,6 +44679,7 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44465,6 +44801,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44581,6 +44918,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44700,6 +45038,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44820,6 +45159,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44940,6 +45280,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45060,6 +45401,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45186,6 +45528,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45307,6 +45650,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45431,6 +45775,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45556,6 +45901,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45681,6 +46027,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45806,6 +46153,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46707,6 +47055,7 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46751,6 +47100,7 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46807,6 +47157,7 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46865,6 +47216,7 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46915,6 +47267,7 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46973,6 +47326,7 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47029,6 +47383,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47079,6 +47434,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47141,6 +47497,7 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47205,6 +47562,7 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47261,6 +47619,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47325,6 +47684,7 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48255,6 +48615,7 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48301,6 +48662,7 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48367,6 +48729,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48435,6 +48798,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48487,6 +48851,7 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48555,6 +48920,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48613,6 +48979,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48665,6 +49032,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48737,6 +49105,7 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48811,6 +49180,7 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48869,6 +49239,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48943,6 +49314,7 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49190,6 +49562,7 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49261,6 +49634,7 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49335,6 +49709,7 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49410,6 +49785,7 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49485,6 +49861,7 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49560,6 +49937,7 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49641,6 +50019,7 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49717,6 +50096,7 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49796,6 +50176,7 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49876,6 +50257,7 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49956,6 +50338,7 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -50036,6 +50419,7 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -50829,6 +51213,7 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -50870,6 +51255,7 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -50913,6 +51299,7 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, float*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -50964,6 +51351,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask(i4 zeroext %__u, <2 x
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51015,6 +51403,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51068,6 +51457,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51121,6 +51511,7 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51168,6 +51559,7 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51217,6 +51609,7 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51274,6 +51667,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51331,6 +51725,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51390,6 +51785,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51663,6 +52059,7 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51736,6 +52133,7 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51810,6 +52208,7 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51887,6 +52286,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51964,6 +52364,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52042,6 +52443,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52126,6 +52528,7 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52204,6 +52607,7 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52283,6 +52687,7 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52365,6 +52770,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52447,6 +52853,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52530,6 +52937,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52652,6 +53060,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52768,6 +53177,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52885,6 +53295,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53005,6 +53416,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53125,6 +53537,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53246,6 +53659,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53414,6 +53828,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53535,6 +53950,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53657,6 +54073,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53782,6 +54199,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53907,6 +54325,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -54033,6 +54452,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -54886,6 +55306,7 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -54927,6 +55348,7 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -54970,6 +55392,7 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55020,6 +55443,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask(i2 zeroext %__u, <2 x
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55070,6 +55494,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem(i2 zeroext %__u, <
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55122,6 +55547,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55175,6 +55601,7 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55222,6 +55649,7 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55271,6 +55699,7 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55327,6 +55756,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55383,6 +55813,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55441,6 +55872,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56260,6 +56692,7 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56303,6 +56736,7 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56348,6 +56782,7 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56401,6 +56836,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56454,6 +56890,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56509,6 +56946,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56564,6 +57002,7 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56613,6 +57052,7 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56664,6 +57104,7 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56723,6 +57164,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56782,6 +57224,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56843,6 +57286,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57146,6 +57590,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57217,6 +57662,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57289,6 +57735,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57364,6 +57811,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57439,6 +57887,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57515,6 +57964,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57647,6 +58097,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57723,6 +58174,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57800,6 +58252,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57880,6 +58333,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57960,6 +58414,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -58041,6 +58496,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/avx512vl-vec-test-testn.ll b/test/CodeGen/X86/avx512vl-vec-test-testn.ll
index f1919cb118c..32de0254efa 100644
--- a/test/CodeGen/X86/avx512vl-vec-test-testn.ll
+++ b/test/CodeGen/X86/avx512vl-vec-test-testn.ll
@@ -6,18 +6,14 @@
define zeroext i8 @TEST_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_test_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpneqq %xmm1, %xmm0, %k0
+; X86_64-NEXT: vptestmq %xmm0, %xmm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_test_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpneqq %xmm1, %xmm0, %k0
+; I386-NEXT: vptestmq %xmm0, %xmm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -33,18 +29,14 @@ entry:
define zeroext i8 @TEST_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_test_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpneqd %xmm1, %xmm0, %k0
+; X86_64-NEXT: vptestmd %xmm0, %xmm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_test_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpneqd %xmm1, %xmm0, %k0
+; I386-NEXT: vptestmd %xmm0, %xmm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -61,9 +53,7 @@ entry:
define zeroext i8 @TEST_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_test_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpneqq %ymm1, %ymm0, %k0
+; X86_64-NEXT: vptestmq %ymm0, %ymm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -71,9 +61,7 @@ define zeroext i8 @TEST_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) lo
;
; I386-LABEL: TEST_mm256_test_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpneqq %ymm1, %ymm0, %k0
+; I386-NEXT: vptestmq %ymm0, %ymm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -90,9 +78,7 @@ entry:
define zeroext i8 @TEST_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_test_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpneqd %ymm1, %ymm0, %k0
+; X86_64-NEXT: vptestmd %ymm0, %ymm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -100,9 +86,7 @@ define zeroext i8 @TEST_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) lo
;
; I386-LABEL: TEST_mm256_test_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpneqd %ymm1, %ymm0, %k0
+; I386-NEXT: vptestmd %ymm0, %ymm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -119,21 +103,17 @@ entry:
define zeroext i8 @TEST_mm_mask_test_epi64_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_mask_test_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpneqq %xmm1, %xmm0, %k0 {%k1}
+; X86_64-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_mask_test_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpneqq %xmm1, %xmm0, %k0 {%k1}
+; I386-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -152,21 +132,17 @@ entry:
define zeroext i8 @TEST_mm_mask_test_epi32_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_mask_test_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpneqd %xmm1, %xmm0, %k0 {%k1}
+; X86_64-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_mask_test_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpneqd %xmm1, %xmm0, %k0 {%k1}
+; I386-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -187,10 +163,8 @@ entry:
define zeroext i8 @TEST_mm256_mask_test_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_mask_test_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpneqq %ymm1, %ymm0, %k0 {%k1}
+; X86_64-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -198,11 +172,9 @@ define zeroext i8 @TEST_mm256_mask_test_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x
;
; I386-LABEL: TEST_mm256_mask_test_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpneqq %ymm1, %ymm0, %k0 {%k1}
+; I386-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -222,10 +194,8 @@ entry:
define zeroext i8 @TEST_mm256_mask_test_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_mask_test_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpneqd %ymm1, %ymm0, %k0 {%k1}
+; X86_64-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -233,11 +203,9 @@ define zeroext i8 @TEST_mm256_mask_test_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x
;
; I386-LABEL: TEST_mm256_mask_test_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpneqd %ymm1, %ymm0, %k0 {%k1}
+; I386-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -256,18 +224,14 @@ entry:
define zeroext i8 @TEST_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_testn_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; X86_64-NEXT: vptestnmq %xmm0, %xmm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_testn_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; I386-NEXT: vptestnmq %xmm0, %xmm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -283,18 +247,14 @@ entry:
define zeroext i8 @TEST_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_testn_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; X86_64-NEXT: vptestnmd %xmm0, %xmm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_testn_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; I386-NEXT: vptestnmd %xmm0, %xmm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -311,9 +271,7 @@ entry:
define zeroext i8 @TEST_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_testn_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; X86_64-NEXT: vptestnmq %ymm0, %ymm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -321,9 +279,7 @@ define zeroext i8 @TEST_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) l
;
; I386-LABEL: TEST_mm256_testn_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; I386-NEXT: vptestnmq %ymm0, %ymm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -340,9 +296,7 @@ entry:
define zeroext i8 @TEST_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_testn_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; X86_64-NEXT: vptestnmd %ymm0, %ymm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -350,9 +304,7 @@ define zeroext i8 @TEST_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) l
;
; I386-LABEL: TEST_mm256_testn_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; I386-NEXT: vptestnmd %ymm0, %ymm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -369,21 +321,17 @@ entry:
define zeroext i8 @TEST_mm_mask_testn_epi64_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_mask_testn_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; X86_64-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_mask_testn_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; I386-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -402,21 +350,17 @@ entry:
define zeroext i8 @TEST_mm_mask_testn_epi32_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_mask_testn_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; X86_64-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_mask_testn_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; I386-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -437,10 +381,8 @@ entry:
define zeroext i8 @TEST_mm256_mask_testn_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_mask_testn_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; X86_64-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -448,11 +390,9 @@ define zeroext i8 @TEST_mm256_mask_testn_epi64_mask(i8 %__U, <4 x i64> %__A, <4
;
; I386-LABEL: TEST_mm256_mask_testn_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; I386-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -472,10 +412,8 @@ entry:
define zeroext i8 @TEST_mm256_mask_testn_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_mask_testn_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; X86_64-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -483,11 +421,9 @@ define zeroext i8 @TEST_mm256_mask_testn_epi32_mask(i8 %__U, <4 x i64> %__A, <4
;
; I386-LABEL: TEST_mm256_mask_testn_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; I386-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll
new file mode 100644
index 00000000000..ab4cbeb8d5e
--- /dev/null
+++ b/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s
+
+define <2 x i64> @test_mm_broadcastmb_epi64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_mm_broadcastmb_epi64:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %a to <4 x i32>
+ %1 = bitcast <2 x i64> %b to <4 x i32>
+ %2 = icmp eq <4 x i32> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <8 x i1> %3 to i8
+ %conv.i = zext i8 %4 to i64
+ %vecinit.i.i = insertelement <2 x i64> undef, i64 %conv.i, i32 0
+ %vecinit1.i.i = shufflevector <2 x i64> %vecinit.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %vecinit1.i.i
+}
+
+define <4 x i64> @test_mm256_broadcastmb_epi64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_mm256_broadcastmb_epi64:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %ymm0
+; CHECK-NEXT: retq
+entry:
+ %0 = icmp eq <4 x i64> %a, %b
+ %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = bitcast <8 x i1> %1 to i8
+ %conv.i = zext i8 %2 to i64
+ %vecinit.i.i = insertelement <4 x i64> undef, i64 %conv.i, i32 0
+ %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
+ ret <4 x i64> %vecinit3.i.i
+}
+
+define <2 x i64> @test_mm_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_mm_broadcastmw_epi32:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %a to <16 x i32>
+ %1 = bitcast <8 x i64> %b to <16 x i32>
+ %2 = icmp eq <16 x i32> %0, %1
+ %3 = bitcast <16 x i1> %2 to i16
+ %conv.i = zext i16 %3 to i32
+ %vecinit.i.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0
+ %vecinit3.i.i = shufflevector <4 x i32> %vecinit.i.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %4 = bitcast <4 x i32> %vecinit3.i.i to <2 x i64>
+ ret <2 x i64> %4
+}
+
+define <4 x i64> @test_mm256_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_mm256_broadcastmw_epi32:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %ymm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %a to <16 x i32>
+ %1 = bitcast <8 x i64> %b to <16 x i32>
+ %2 = icmp eq <16 x i32> %0, %1
+ %3 = bitcast <16 x i1> %2 to i16
+ %conv.i = zext i16 %3 to i32
+ %vecinit.i.i = insertelement <8 x i32> undef, i32 %conv.i, i32 0
+ %vecinit7.i.i = shufflevector <8 x i32> %vecinit.i.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %4 = bitcast <8 x i32> %vecinit7.i.i to <4 x i64>
+ ret <4 x i64> %4
+}
+
+
diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll
index e197713c679..c48222000c6 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -439,6 +439,7 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
; AVX512F-NEXT: movl (%rsp), %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
diff --git a/test/CodeGen/X86/bitcast-and-setcc-512.ll b/test/CodeGen/X86/bitcast-and-setcc-512.ll
index f6cfbbb4044..f5fe395eaf3 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-512.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-512.ll
@@ -594,6 +594,7 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-NEXT: movl (%rsp), %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -1239,6 +1240,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: .cfi_def_cfa %rsp, 8
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1457,6 +1459,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: .cfi_def_cfa %rsp, 8
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1499,6 +1502,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; AVX512F-NEXT: orq %rcx, %rax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index 4ed55ac0919..1959000b859 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -321,11 +321,17 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) {
; AVX512-NEXT: vpinsrb $15, %r9d, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: popq %r12
+; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: popq %r13
+; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: popq %r14
+; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: popq %r15
+; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
%1 = bitcast i16 %a0 to <16 x i1>
%2 = zext <16 x i1> %1 to <16 x i8>
diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll
index ee2dac1d466..76160517546 100644
--- a/test/CodeGen/X86/bitcast-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-setcc-256.ll
@@ -204,6 +204,7 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX512F-NEXT: movl (%rsp), %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
diff --git a/test/CodeGen/X86/bitcast-setcc-512.ll b/test/CodeGen/X86/bitcast-setcc-512.ll
index 2b73c6e16bd..ef981080bb3 100644
--- a/test/CodeGen/X86/bitcast-setcc-512.ll
+++ b/test/CodeGen/X86/bitcast-setcc-512.ll
@@ -203,6 +203,7 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX512F-NEXT: movl (%rsp), %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -769,6 +770,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: .cfi_def_cfa %rsp, 8
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -983,6 +985,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: .cfi_def_cfa %rsp, 8
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1021,6 +1024,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-NEXT: orq %rcx, %rax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
diff --git a/test/CodeGen/X86/bool-vector.ll b/test/CodeGen/X86/bool-vector.ll
index eb40744c54d..692d992df76 100644
--- a/test/CodeGen/X86/bool-vector.ll
+++ b/test/CodeGen/X86/bool-vector.ll
@@ -93,6 +93,7 @@ define i32 @PR15215_good(<4 x i32> %input) {
; X32-NEXT: leal (%eax,%edx,4), %eax
; X32-NEXT: leal (%eax,%esi,8), %eax
; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X32-SSE2-LABEL: PR15215_good:
@@ -115,6 +116,7 @@ define i32 @PR15215_good(<4 x i32> %input) {
; X32-SSE2-NEXT: leal (%eax,%edx,4), %eax
; X32-SSE2-NEXT: leal (%eax,%esi,8), %eax
; X32-SSE2-NEXT: popl %esi
+; X32-SSE2-NEXT: .cfi_def_cfa_offset 4
; X32-SSE2-NEXT: retl
;
; X32-AVX2-LABEL: PR15215_good:
@@ -134,6 +136,7 @@ define i32 @PR15215_good(<4 x i32> %input) {
; X32-AVX2-NEXT: leal (%eax,%edx,4), %eax
; X32-AVX2-NEXT: leal (%eax,%esi,8), %eax
; X32-AVX2-NEXT: popl %esi
+; X32-AVX2-NEXT: .cfi_def_cfa_offset 4
; X32-AVX2-NEXT: retl
;
; X64-LABEL: PR15215_good:
diff --git a/test/CodeGen/X86/broadcastm-lowering.ll b/test/CodeGen/X86/broadcastm-lowering.ll
index 2a8236cf093..fc7b192c2f8 100644
--- a/test/CodeGen/X86/broadcastm-lowering.ll
+++ b/test/CodeGen/X86/broadcastm-lowering.ll
@@ -80,8 +80,7 @@ define <16 x i32> @test_mm512_epi32(<16 x i32> %a, <16 x i32> %b) {
; AVX512CD-LABEL: test_mm512_epi32:
; AVX512CD: # BB#0: # %entry
; AVX512CD-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; AVX512CD-NEXT: kmovw %k0, %eax
-; AVX512CD-NEXT: vpbroadcastd %eax, %zmm0
+; AVX512CD-NEXT: vpbroadcastmw2d %k0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512VLCDBW-LABEL: test_mm512_epi32:
@@ -110,9 +109,7 @@ define <8 x i64> @test_mm512_epi64(<8 x i32> %a, <8 x i32> %b) {
; AVX512CD-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; AVX512CD-NEXT: kmovw %k0, %eax
-; AVX512CD-NEXT: movzbl %al, %eax
-; AVX512CD-NEXT: vpbroadcastq %rax, %zmm0
+; AVX512CD-NEXT: vpbroadcastmb2q %k0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512VLCDBW-LABEL: test_mm512_epi64:
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index 82e133d2576..6f9abae6a71 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -247,10 +247,13 @@ define i32 @test12() ssp uwtable {
; CHECK-NEXT: # BB#1: # %T
; CHECK-NEXT: movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
; CHECK-NEXT: popq %rcx # encoding: [0x59]
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq # encoding: [0xc3]
; CHECK-NEXT: .LBB12_2: # %F
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movl $2, %eax # encoding: [0xb8,0x02,0x00,0x00,0x00]
; CHECK-NEXT: popq %rcx # encoding: [0x59]
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%tmp1 = call zeroext i1 @test12b()
diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll
index 9f7f8a97dc2..c5f03dbd5a3 100644
--- a/test/CodeGen/X86/combine-srl.ll
+++ b/test/CodeGen/X86/combine-srl.ll
@@ -175,7 +175,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
; SSE: # BB#0:
; SSE-NEXT: psrlq $48, %xmm1
; SSE-NEXT: psrlq $48, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: packusdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr0:
diff --git a/test/CodeGen/X86/compress_expand.ll b/test/CodeGen/X86/compress_expand.ll
index c6a1c07922e..9237544ea95 100644
--- a/test/CodeGen/X86/compress_expand.ll
+++ b/test/CodeGen/X86/compress_expand.ll
@@ -140,9 +140,7 @@ define void @test7(float* %base, <8 x float> %V, <8 x i1> %mask) {
; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL-NEXT: vptestmq %zmm1, %zmm1, %k0
-; KNL-NEXT: kshiftlw $8, %k0, %k0
-; KNL-NEXT: kshiftrw $8, %k0, %k1
+; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
; KNL-NEXT: retq
call void @llvm.masked.compressstore.v8f32(<8 x float> %V, float* %base, <8 x i1> %mask)
diff --git a/test/CodeGen/X86/emutls-pie.ll b/test/CodeGen/X86/emutls-pie.ll
index 3c312a92669..f4561fcbd35 100644
--- a/test/CodeGen/X86/emutls-pie.ll
+++ b/test/CodeGen/X86/emutls-pie.ll
@@ -18,13 +18,16 @@ define i32 @my_get_xyz() {
; X32-NEXT: calll my_emutls_get_address@PLT
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
; X64-LABEL: my_get_xyz:
; X64: movq my_emutls_v_xyz@GOTPCREL(%rip), %rdi
; X64-NEXT: callq my_emutls_get_address@PLT
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
@@ -44,13 +47,16 @@ define i32 @f1() {
; X32-NEXT: calll __emutls_get_address@PLT
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
; X64-LABEL: f1:
; X64: leaq __emutls_v.i(%rip), %rdi
; X64-NEXT: callq __emutls_get_address@PLT
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/emutls.ll b/test/CodeGen/X86/emutls.ll
index 8c0ba903659..2321cd2fc28 100644
--- a/test/CodeGen/X86/emutls.ll
+++ b/test/CodeGen/X86/emutls.ll
@@ -16,12 +16,14 @@ define i32 @my_get_xyz() {
; X32-NEXT: calll my_emutls_get_address
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
; X64-LABEL: my_get_xyz:
; X64: movl $my_emutls_v_xyz, %edi
; X64-NEXT: callq my_emutls_get_address
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
@@ -45,12 +47,14 @@ define i32 @f1() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
; X64-LABEL: f1:
; X64: movl $__emutls_v.i1, %edi
; X64-NEXT: callq __emutls_get_address
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
@@ -63,11 +67,13 @@ define i32* @f2() {
; X32: movl $__emutls_v.i1, (%esp)
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
; X64-LABEL: f2:
; X64: movl $__emutls_v.i1, %edi
; X64-NEXT: callq __emutls_get_address
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
@@ -92,6 +98,7 @@ define i32* @f4() {
; X32: movl $__emutls_v.i2, (%esp)
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -116,6 +123,7 @@ define i32* @f6() {
; X32: movl $__emutls_v.i3, (%esp)
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -128,6 +136,7 @@ define i32 @f7() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -140,6 +149,7 @@ define i32* @f8() {
; X32: movl $__emutls_v.i4, (%esp)
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -152,6 +162,7 @@ define i32 @f9() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -164,6 +175,7 @@ define i32* @f10() {
; X32: movl $__emutls_v.i5, (%esp)
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -176,6 +188,7 @@ define i16 @f11() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movzwl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -189,6 +202,7 @@ define i32 @f12() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movswl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -203,6 +217,7 @@ define i8 @f13() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movb (%eax), %al
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -216,6 +231,7 @@ define i32 @f14() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movsbl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
diff --git a/test/CodeGen/X86/epilogue-cfi-fp.ll b/test/CodeGen/X86/epilogue-cfi-fp.ll
new file mode 100644
index 00000000000..c2fe1c7eaac
--- /dev/null
+++ b/test/CodeGen/X86/epilogue-cfi-fp.ll
@@ -0,0 +1,43 @@
+; RUN: llc -O0 %s -o - | FileCheck %s
+
+; ModuleID = 'epilogue-cfi-fp.c'
+source_filename = "epilogue-cfi-fp.c"
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i686-pc-linux"
+
+; Function Attrs: noinline nounwind
+define i32 @foo(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m) #0 {
+
+; CHECK-LABEL: foo:
+; CHECK: popl %ebp
+; CHECK-NEXT: .cfi_def_cfa %esp, 4
+; CHECK-NEXT: retl
+
+entry:
+ %i.addr = alloca i32, align 4
+ %j.addr = alloca i32, align 4
+ %k.addr = alloca i32, align 4
+ %l.addr = alloca i32, align 4
+ %m.addr = alloca i32, align 4
+ store i32 %i, i32* %i.addr, align 4
+ store i32 %j, i32* %j.addr, align 4
+ store i32 %k, i32* %k.addr, align 4
+ store i32 %l, i32* %l.addr, align 4
+ store i32 %m, i32* %m.addr, align 4
+ ret i32 0
+}
+
+attributes #0 = { "no-frame-pointer-elim"="true" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (http://llvm.org/git/clang.git 3f8116e6a2815b1d5f3491493938d0c63c9f42c9) (http://llvm.org/git/llvm.git 4fde77f8f1a8e4482e69b6a7484bc7d1b99b3c0a)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "epilogue-cfi-fp.c", directory: "epilogue-dwarf/test")
+!2 = !{}
+!3 = !{i32 1, !"NumRegisterParameters", i32 0}
+!4 = !{i32 2, !"Dwarf Version", i32 4}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{i32 1, !"wchar_size", i32 4}
+!7 = !{i32 7, !"PIC Level", i32 2}
+
diff --git a/test/CodeGen/X86/epilogue-cfi-no-fp.ll b/test/CodeGen/X86/epilogue-cfi-no-fp.ll
new file mode 100644
index 00000000000..79d6f478de8
--- /dev/null
+++ b/test/CodeGen/X86/epilogue-cfi-no-fp.ll
@@ -0,0 +1,46 @@
+; RUN: llc -O0 < %s | FileCheck %s
+
+; ModuleID = 'epilogue-cfi-no-fp.c'
+source_filename = "epilogue-cfi-no-fp.c"
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i686-pc-linux"
+
+; Function Attrs: noinline nounwind
+define i32 @foo(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m) {
+; CHECK-LABEL: foo:
+; CHECK: addl $20, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 12
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: .cfi_def_cfa_offset 4
+; CHECK-NEXT: retl
+entry:
+ %i.addr = alloca i32, align 4
+ %j.addr = alloca i32, align 4
+ %k.addr = alloca i32, align 4
+ %l.addr = alloca i32, align 4
+ %m.addr = alloca i32, align 4
+ store i32 %i, i32* %i.addr, align 4
+ store i32 %j, i32* %j.addr, align 4
+ store i32 %k, i32* %k.addr, align 4
+ store i32 %l, i32* %l.addr, align 4
+ store i32 %m, i32* %m.addr, align 4
+ ret i32 0
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (http://llvm.org/git/clang.git 3f8116e6a2815b1d5f3491493938d0c63c9f42c9) (http://llvm.org/git/llvm.git 4fde77f8f1a8e4482e69b6a7484bc7d1b99b3c0a)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "epilogue-cfi-no-fp.c", directory: "epilogue-dwarf/test")
+!2 = !{}
+!3 = !{i32 1, !"NumRegisterParameters", i32 0}
+!4 = !{i32 2, !"Dwarf Version", i32 4}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{i32 1, !"wchar_size", i32 4}
+!7 = !{i32 7, !"PIC Level", i32 2}
+
+
diff --git a/test/CodeGen/X86/f16c-intrinsics.ll b/test/CodeGen/X86/f16c-intrinsics.ll
index 712fe810d2a..64f8fd0ca8d 100644
--- a/test/CodeGen/X86/f16c-intrinsics.ll
+++ b/test/CodeGen/X86/f16c-intrinsics.ll
@@ -1,33 +1,81 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+f16c -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X32-AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X64-AVX512VL
define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) {
; X32-LABEL: test_x86_vcvtph2ps_128:
; X32: # BB#0:
-; X32-NEXT: vcvtph2ps %xmm0, %xmm0
-; X32-NEXT: retl
+; X32-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtph2ps_128:
; X64: # BB#0:
-; X64-NEXT: vcvtph2ps %xmm0, %xmm0
-; X64-NEXT: retq
+; X64-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_128:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly
+define <4 x float> @test_x86_vcvtph2ps_128_m(<8 x i16>* nocapture %a) {
+; X32-LABEL: test_x86_vcvtph2ps_128_m:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_vcvtph2ps_128_m:
+; X64: # BB#0:
+; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_128_m:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128_m:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
+ %load = load <8 x i16>, <8 x i16>* %a
+ %res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %load) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) {
; X32-LABEL: test_x86_vcvtph2ps_256:
; X32: # BB#0:
-; X32-NEXT: vcvtph2ps %xmm0, %ymm0
-; X32-NEXT: retl
+; X32-NEXT: vcvtph2ps %xmm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtph2ps_256:
; X64: # BB#0:
-; X64-NEXT: vcvtph2ps %xmm0, %ymm0
-; X64-NEXT: retq
+; X64-NEXT: vcvtph2ps %xmm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_256:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_256:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -36,15 +84,26 @@ declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly
define <8 x float> @test_x86_vcvtph2ps_256_m(<8 x i16>* nocapture %a) nounwind {
; X32-LABEL: test_x86_vcvtph2ps_256_m:
; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtph2ps (%eax), %ymm0
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtph2ps (%eax), %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtph2ps_256_m:
; X64: # BB#0:
-; X64-NEXT: vcvtph2ps (%rdi), %ymm0
-; X64-NEXT: retq
- %load = load <8 x i16>, <8 x i16>* %a, align 16
+; X64-NEXT: vcvtph2ps (%rdi), %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_256_m:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_256_m:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
+ %load = load <8 x i16>, <8 x i16>* %a
%res = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %load)
ret <8 x float> %res
}
@@ -52,13 +111,23 @@ define <8 x float> @test_x86_vcvtph2ps_256_m(<8 x i16>* nocapture %a) nounwind {
define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0) {
; X32-LABEL: test_x86_vcvtps2ph_128:
; X32: # BB#0:
-; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0
-; X32-NEXT: retl
+; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128:
; X64: # BB#0:
-; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -67,15 +136,27 @@ declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly
define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0) {
; X32-LABEL: test_x86_vcvtps2ph_256:
; X32: # BB#0:
-; X32-NEXT: vcvtps2ph $0, %ymm0, %xmm0
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
+; X32-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00]
+; X32-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_256:
; X64: # BB#0:
-; X64-NEXT: vcvtps2ph $0, %ymm0, %xmm0
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00]
+; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_256:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00]
+; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_256:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00]
+; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -84,14 +165,25 @@ declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
define <4 x float> @test_x86_vcvtps2ph_128_scalar(i64* %ptr) {
; X32-LABEL: test_x86_vcvtps2ph_128_scalar:
; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtph2ps (%eax), %xmm0
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_scalar:
; X64: # BB#0:
-; X64-NEXT: vcvtph2ps (%rdi), %xmm0
-; X64-NEXT: retq
+; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%load = load i64, i64* %ptr
%ins1 = insertelement <2 x i64> undef, i64 %load, i32 0
%ins2 = insertelement <2 x i64> %ins1, i64 0, i32 1
@@ -103,14 +195,25 @@ define <4 x float> @test_x86_vcvtps2ph_128_scalar(i64* %ptr) {
define <4 x float> @test_x86_vcvtps2ph_128_scalar2(i64* %ptr) {
; X32-LABEL: test_x86_vcvtps2ph_128_scalar2:
; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtph2ps (%eax), %xmm0
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_scalar2:
; X64: # BB#0:
-; X64-NEXT: vcvtph2ps (%rdi), %xmm0
-; X64-NEXT: retq
+; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar2:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar2:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%load = load i64, i64* %ptr
%ins = insertelement <2 x i64> undef, i64 %load, i32 0
%bc = bitcast <2 x i64> %ins to <8 x i16>
@@ -121,16 +224,29 @@ define <4 x float> @test_x86_vcvtps2ph_128_scalar2(i64* %ptr) {
define void @test_x86_vcvtps2ph_256_m(<8 x i16>* nocapture %d, <8 x float> %a) nounwind {
; X32-LABEL: test_x86_vcvtps2ph_256_m:
; X32: # BB#0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtps2ph $3, %ymm0, (%eax)
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtps2ph $3, %ymm0, (%eax) # encoding: [0xc4,0xe3,0x7d,0x1d,0x00,0x03]
+; X32-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_256_m:
; X64: # BB#0: # %entry
-; X64-NEXT: vcvtps2ph $3, %ymm0, (%rdi)
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $3, %ymm0, (%rdi) # encoding: [0xc4,0xe3,0x7d,0x1d,0x07,0x03]
+; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_256_m:
+; X32-AVX512VL: # BB#0: # %entry
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0x00,0x03]
+; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_256_m:
+; X64-AVX512VL: # BB#0: # %entry
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0x07,0x03]
+; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a, i32 3)
store <8 x i16> %0, <8 x i16>* %d, align 16
@@ -140,14 +256,31 @@ entry:
define void @test_x86_vcvtps2ph_128_m(<4 x i16>* nocapture %d, <4 x float> %a) nounwind {
; X32-LABEL: test_x86_vcvtps2ph_128_m:
; X32: # BB#0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax)
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_m:
; X64: # BB#0: # %entry
-; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi)
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m:
+; X32-AVX512VL: # BB#0: # %entry
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x03]
+; X32-AVX512VL-NEXT: vpmovzxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0]
+; X32-AVX512VL-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-AVX512VL-NEXT: vpmovdw %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x33,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m:
+; X64-AVX512VL: # BB#0: # %entry
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x03]
+; X64-AVX512VL-NEXT: vpmovzxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0]
+; X64-AVX512VL-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX512VL-NEXT: vpmovdw %xmm0, (%rdi) # encoding: [0x62,0xf2,0x7e,0x08,0x33,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a, i32 3)
%1 = shufflevector <8 x i16> %0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -158,14 +291,25 @@ entry:
define void @test_x86_vcvtps2ph_128_m2(double* nocapture %hf4x16, <4 x float> %f4x32) #0 {
; X32-LABEL: test_x86_vcvtps2ph_128_m2:
; X32: # BB#0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax)
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_m2:
; X64: # BB#0: # %entry
-; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi)
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m2:
+; X32-AVX512VL: # BB#0: # %entry
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m2:
+; X64-AVX512VL: # BB#0: # %entry
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %f4x32, i32 3)
%1 = bitcast <8 x i16> %0 to <2 x double>
@@ -177,14 +321,25 @@ entry:
define void @test_x86_vcvtps2ph_128_m3(i64* nocapture %hf4x16, <4 x float> %f4x32) #0 {
; X32-LABEL: test_x86_vcvtps2ph_128_m3:
; X32: # BB#0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax)
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_m3:
; X64: # BB#0: # %entry
-; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi)
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m3:
+; X32-AVX512VL: # BB#0: # %entry
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m3:
+; X64-AVX512VL: # BB#0: # %entry
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %f4x32, i32 3)
%1 = bitcast <8 x i16> %0 to <2 x i64>
diff --git a/test/CodeGen/X86/fast-isel-int-float-conversion.ll b/test/CodeGen/X86/fast-isel-int-float-conversion.ll
index 3e69710868b..57b50abab53 100644
--- a/test/CodeGen/X86/fast-isel-int-float-conversion.ll
+++ b/test/CodeGen/X86/fast-isel-int-float-conversion.ll
@@ -31,6 +31,7 @@ define double @int_to_double_rr(i32 %a) {
; SSE2_X86-NEXT: fldl (%esp)
; SSE2_X86-NEXT: movl %ebp, %esp
; SSE2_X86-NEXT: popl %ebp
+; SSE2_X86-NEXT: .cfi_def_cfa %esp, 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_double_rr:
@@ -47,6 +48,7 @@ define double @int_to_double_rr(i32 %a) {
; AVX_X86-NEXT: fldl (%esp)
; AVX_X86-NEXT: movl %ebp, %esp
; AVX_X86-NEXT: popl %ebp
+; AVX_X86-NEXT: .cfi_def_cfa %esp, 4
; AVX_X86-NEXT: retl
entry:
%0 = sitofp i32 %a to double
@@ -80,6 +82,7 @@ define double @int_to_double_rm(i32* %a) {
; SSE2_X86-NEXT: fldl (%esp)
; SSE2_X86-NEXT: movl %ebp, %esp
; SSE2_X86-NEXT: popl %ebp
+; SSE2_X86-NEXT: .cfi_def_cfa %esp, 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_double_rm:
@@ -97,6 +100,7 @@ define double @int_to_double_rm(i32* %a) {
; AVX_X86-NEXT: fldl (%esp)
; AVX_X86-NEXT: movl %ebp, %esp
; AVX_X86-NEXT: popl %ebp
+; AVX_X86-NEXT: .cfi_def_cfa %esp, 4
; AVX_X86-NEXT: retl
entry:
%0 = load i32, i32* %a
@@ -130,6 +134,7 @@ define double @int_to_double_rm_optsize(i32* %a) optsize {
; SSE2_X86-NEXT: fldl (%esp)
; SSE2_X86-NEXT: movl %ebp, %esp
; SSE2_X86-NEXT: popl %ebp
+; SSE2_X86-NEXT: .cfi_def_cfa %esp, 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_double_rm_optsize:
@@ -147,6 +152,7 @@ define double @int_to_double_rm_optsize(i32* %a) optsize {
; AVX_X86-NEXT: fldl (%esp)
; AVX_X86-NEXT: movl %ebp, %esp
; AVX_X86-NEXT: popl %ebp
+; AVX_X86-NEXT: .cfi_def_cfa %esp, 4
; AVX_X86-NEXT: retl
entry:
%0 = load i32, i32* %a
@@ -174,6 +180,7 @@ define float @int_to_float_rr(i32 %a) {
; SSE2_X86-NEXT: movss %xmm0, (%esp)
; SSE2_X86-NEXT: flds (%esp)
; SSE2_X86-NEXT: popl %eax
+; SSE2_X86-NEXT: .cfi_def_cfa_offset 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_float_rr:
@@ -184,6 +191,7 @@ define float @int_to_float_rr(i32 %a) {
; AVX_X86-NEXT: vmovss %xmm0, (%esp)
; AVX_X86-NEXT: flds (%esp)
; AVX_X86-NEXT: popl %eax
+; AVX_X86-NEXT: .cfi_def_cfa_offset 4
; AVX_X86-NEXT: retl
entry:
%0 = sitofp i32 %a to float
@@ -211,6 +219,7 @@ define float @int_to_float_rm(i32* %a) {
; SSE2_X86-NEXT: movss %xmm0, (%esp)
; SSE2_X86-NEXT: flds (%esp)
; SSE2_X86-NEXT: popl %eax
+; SSE2_X86-NEXT: .cfi_def_cfa_offset 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_float_rm:
@@ -222,6 +231,7 @@ define float @int_to_float_rm(i32* %a) {
; AVX_X86-NEXT: vmovss %xmm0, (%esp)
; AVX_X86-NEXT: flds (%esp)
; AVX_X86-NEXT: popl %eax
+; AVX_X86-NEXT: .cfi_def_cfa_offset 4
; AVX_X86-NEXT: retl
entry:
%0 = load i32, i32* %a
@@ -249,6 +259,7 @@ define float @int_to_float_rm_optsize(i32* %a) optsize {
; SSE2_X86-NEXT: movss %xmm0, (%esp)
; SSE2_X86-NEXT: flds (%esp)
; SSE2_X86-NEXT: popl %eax
+; SSE2_X86-NEXT: .cfi_def_cfa_offset 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_float_rm_optsize:
@@ -260,6 +271,7 @@ define float @int_to_float_rm_optsize(i32* %a) optsize {
; AVX_X86-NEXT: vmovss %xmm0, (%esp)
; AVX_X86-NEXT: flds (%esp)
; AVX_X86-NEXT: popl %eax
+; AVX_X86-NEXT: .cfi_def_cfa_offset 4
; AVX_X86-NEXT: retl
entry:
%0 = load i32, i32* %a
diff --git a/test/CodeGen/X86/fast-isel-store.ll b/test/CodeGen/X86/fast-isel-store.ll
index e359e620563..e2412e9c5c0 100644
--- a/test/CodeGen/X86/fast-isel-store.ll
+++ b/test/CodeGen/X86/fast-isel-store.ll
@@ -375,6 +375,7 @@ define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double
; SSE64-NEXT: movupd %xmm0, (%eax)
; SSE64-NEXT: movupd %xmm1, 16(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf64:
@@ -413,6 +414,7 @@ define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4
; SSE64-NEXT: movapd %xmm0, (%eax)
; SSE64-NEXT: movapd %xmm1, 16(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf64_aligned:
@@ -452,6 +454,7 @@ define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %va
; SSE64-NEXT: movups %xmm2, 32(%eax)
; SSE64-NEXT: movups %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xi32:
@@ -501,6 +504,7 @@ define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x
; SSE64-NEXT: movaps %xmm2, 32(%eax)
; SSE64-NEXT: movaps %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xi32_aligned:
@@ -550,6 +554,7 @@ define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x floa
; SSE64-NEXT: movups %xmm2, 32(%eax)
; SSE64-NEXT: movups %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xf32:
@@ -599,6 +604,7 @@ define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <1
; SSE64-NEXT: movaps %xmm2, 32(%eax)
; SSE64-NEXT: movaps %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xf32_aligned:
@@ -656,6 +662,7 @@ define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double
; SSE64-NEXT: movupd %xmm2, 32(%eax)
; SSE64-NEXT: movupd %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_8xf64:
@@ -682,6 +689,7 @@ define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double
; AVXONLY64-NEXT: vmovupd %ymm1, 32(%eax)
; AVXONLY64-NEXT: movl %ebp, %esp
; AVXONLY64-NEXT: popl %ebp
+; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_8xf64:
@@ -729,6 +737,7 @@ define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8
; SSE64-NEXT: movapd %xmm2, 32(%eax)
; SSE64-NEXT: movapd %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_8xf64_aligned:
@@ -755,6 +764,7 @@ define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8
; AVXONLY64-NEXT: vmovapd %ymm1, 32(%eax)
; AVXONLY64-NEXT: movl %ebp, %esp
; AVXONLY64-NEXT: popl %ebp
+; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_8xf64_aligned:
diff --git a/test/CodeGen/X86/fma-intrinsics-x86.ll b/test/CodeGen/X86/fma-intrinsics-x86.ll
index 68f39469a82..362864f72a9 100644
--- a/test/CodeGen/X86/fma-intrinsics-x86.ll
+++ b/test/CodeGen/X86/fma-intrinsics-x86.ll
@@ -1,29 +1,32 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
-; RUN: llc < %s -mtriple=x86_64-pc-windows -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX512VL
+; RUN: llc < %s -mtriple=x86_64-pc-windows -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma4,-fma -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
; VFMADD
define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa9,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6a,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -31,21 +34,27 @@ define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4
define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xa9,0xca]
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca]
+; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa9,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_bac_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6a,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
}
@@ -54,20 +63,25 @@ declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float
define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa9,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6b,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -75,21 +89,27 @@ define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1,
define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
+; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
+; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa9,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_bac_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6b,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
}
@@ -98,20 +118,25 @@ declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x do
define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa8,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x68,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -120,20 +145,25 @@ declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float
define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa8,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x69,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -142,20 +172,25 @@ declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x do
define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa8,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x68,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -164,20 +199,25 @@ declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x f
define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa8,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x69,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -187,20 +227,25 @@ declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4
define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xab,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xab,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xab,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6e,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -208,21 +253,27 @@ define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4
define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xab,0xca]
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca]
+; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xab,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_bac_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6e,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
}
@@ -231,20 +282,25 @@ declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float
define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xab,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6f,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -252,21 +308,27 @@ define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1,
define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xab,0xca]
+; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca]
+; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xab,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_bac_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6f,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
}
@@ -275,20 +337,25 @@ declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x do
define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaa,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -297,20 +364,25 @@ declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float
define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaa,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -319,20 +391,25 @@ declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x do
define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xaa,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x6c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -341,20 +418,25 @@ declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x f
define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xaa,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x6d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -364,20 +446,25 @@ declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4
define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xad,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xad,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xad,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7a,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -385,21 +472,27 @@ define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4
define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xad,0xca]
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca]
+; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xad,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_bac_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x7a,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
}
@@ -408,20 +501,25 @@ declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xad,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7b,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -429,21 +527,27 @@ define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1,
define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xad,0xca]
+; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca]
+; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xad,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_bac_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x7b,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
}
@@ -452,20 +556,25 @@ declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x d
define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xac,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xac,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xac,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x78,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -474,20 +583,25 @@ declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xac,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x79,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -496,20 +610,25 @@ declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x d
define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xac,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xac,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xac,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x78,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -518,20 +637,25 @@ declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x
define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xac,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x79,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -541,20 +665,25 @@ declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4
define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaf,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7e,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -562,21 +691,27 @@ define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4
define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xaf,0xca]
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca]
+; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaf,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_bac_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x7e,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
}
@@ -585,20 +720,25 @@ declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaf,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7f,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -606,21 +746,27 @@ define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1,
define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
+; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
+; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaf,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_bac_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x7f,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
}
@@ -629,20 +775,25 @@ declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x d
define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xae,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xae,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xae,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -651,20 +802,25 @@ declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xae,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -673,20 +829,25 @@ declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x d
define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xae,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xae,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xae,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x7c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -695,20 +856,25 @@ declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x
define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xae,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x7d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -718,20 +884,25 @@ declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4
define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa6,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -740,20 +911,25 @@ declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x fl
define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa6,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -762,20 +938,25 @@ declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x
define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa6,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -784,20 +965,25 @@ declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8
define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa6,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -807,20 +993,25 @@ declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>,
define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa7,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5e,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -829,20 +1020,25 @@ declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x fl
define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa7,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5f,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -851,20 +1047,25 @@ declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x
define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa7,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5e,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -873,20 +1074,25 @@ declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8
define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa7,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5f,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
diff --git a/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll b/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll
index ba80c839fdd..ee64790d1d9 100644
--- a/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll
+++ b/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll
@@ -18,11 +18,15 @@ entry:
}
; CHECK-LABEL: noDebug
-; CHECK: addq $24, %rsp
-; CHECK: popq %rbx
-; CHECK-NEXT: popq %r14
-; CHECK-NEXT: retq
-
+; CHECK: addq $16, %rsp
+; CHECK-NEXT: .cfi_adjust_cfa_offset -16
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
define void @withDebug() !dbg !18 {
entry:
@@ -42,9 +46,11 @@ entry:
; CHECK-LABEL: withDebug
; CHECK: callq printf
; CHECK: callq printf
-; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: addq $16, %rsp
; CHECK: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)
diff --git a/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll b/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll
index f9ecf707810..de9d6bf93d6 100644
--- a/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll
+++ b/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll
@@ -9,6 +9,7 @@ define i64 @fn1NoDebug(i64 %a) {
; CHECK-LABEL: fn1NoDebug
; CHECK: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: ret
define i64 @fn1WithDebug(i64 %a) !dbg !4 {
@@ -19,6 +20,7 @@ define i64 @fn1WithDebug(i64 %a) !dbg !4 {
; CHECK-LABEL: fn1WithDebug
; CHECK: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: ret
%struct.Buffer = type { i8, [63 x i8] }
@@ -33,6 +35,7 @@ define void @fn2NoDebug(%struct.Buffer* byval align 64 %p1) {
; CHECK-NOT: sub
; CHECK: mov
; CHECK-NEXT: pop
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
; CHECK-NEXT: ret
define void @fn2WithDebug(%struct.Buffer* byval align 64 %p1) !dbg !8 {
@@ -46,6 +49,7 @@ define void @fn2WithDebug(%struct.Buffer* byval align 64 %p1) !dbg !8 {
; CHECK-NOT: sub
; CHECK: mov
; CHECK-NEXT: pop
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
; CHECK-NEXT: ret
declare i64 @fn(i64, i64)
diff --git a/test/CodeGen/X86/haddsub-2.ll b/test/CodeGen/X86/haddsub-2.ll
index e32c7452b0c..7126fb233e6 100644
--- a/test/CodeGen/X86/haddsub-2.ll
+++ b/test/CodeGen/X86/haddsub-2.ll
@@ -724,11 +724,17 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT: popq %rbx
+; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: popq %r12
+; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: popq %r13
+; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: popq %r14
+; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: popq %r15
+; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: popq %rbp
+; SSE3-NEXT: .cfi_def_cfa_offset 8
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_vphadd_w_test:
@@ -1351,11 +1357,17 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT: popq %rbx
+; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: popq %r12
+; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: popq %r13
+; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: popq %r14
+; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: popq %r15
+; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: popq %rbp
+; SSE3-NEXT: .cfi_def_cfa_offset 8
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_hadd_w:
diff --git a/test/CodeGen/X86/hipe-cc64.ll b/test/CodeGen/X86/hipe-cc64.ll
index efe07cf6301..ce2d0e9c671 100644
--- a/test/CodeGen/X86/hipe-cc64.ll
+++ b/test/CodeGen/X86/hipe-cc64.ll
@@ -87,6 +87,7 @@ define cc 11 { i64, i64, i64 } @tailcaller(i64 %hp, i64 %p) #0 {
; CHECK-NEXT: movl $47, %ecx
; CHECK-NEXT: movl $63, %r8d
; CHECK-NEXT: popq %rax
+ ; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: jmp tailcallee
%ret = tail call cc11 { i64, i64, i64 } @tailcallee(i64 %hp, i64 %p, i64 15,
i64 31, i64 47, i64 63, i64 79) #1
diff --git a/test/CodeGen/X86/horizontal-reduce-smax.ll b/test/CodeGen/X86/horizontal-reduce-smax.ll
new file mode 100644
index 00000000000..8f5aac493b5
--- /dev/null
+++ b/test/CodeGen/X86/horizontal-reduce-smax.ll
@@ -0,0 +1,1896 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+ %2 = icmp sgt <2 x i64> %a0, %1
+ %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+ %4 = extractelement <2 x i64> %3, i32 0
+ ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp sgt <4 x i32> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <4 x i32> %3, %4
+ %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+ %7 = extractelement <4 x i32> %6, i32 0
+ ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v8i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v8i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <8 x i16> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <8 x i16> %3, %4
+ %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+ %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <8 x i16> %6, %7
+ %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+ %10 = extractelement <8 x i16> %9, i32 0
+ ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <16 x i8> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <16 x i8> %3, %4
+ %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+ %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <16 x i8> %6, %7
+ %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+ %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <16 x i8> %9, %10
+ %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+ %13 = extractelement <16 x i8> %12, i32 0
+ ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm6, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movq %xmm2, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp sgt <4 x i64> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+ %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <4 x i64> %3, %4
+ %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+ %7 = extractelement <4 x i64> %6, i32 0
+ ret i64 %7
+}
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <8 x i32> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+ %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <8 x i32> %3, %4
+ %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+ %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <8 x i32> %6, %7
+ %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+ %10 = extractelement <8 x i32> %9, i32 0
+ ret i32 %10
+}
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v16i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v16i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <16 x i16> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+ %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <16 x i16> %3, %4
+ %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+ %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <16 x i16> %6, %7
+ %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+ %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <16 x i16> %9, %10
+ %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+ %13 = extractelement <16 x i16> %12, i32 0
+ ret i16 %13
+}
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <32 x i8> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+ %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <32 x i8> %3, %4
+ %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+ %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <32 x i8> %6, %7
+ %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+ %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <32 x i8> %9, %10
+ %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+ %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp sgt <32 x i8> %12, %13
+ %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+ %16 = extractelement <32 x i8> %15, i32 0
+ ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: subl $28, %esp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT: pand %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm7, %xmm6
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT: por %xmm2, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm6
+; X86-SSE2-NEXT: pandn %xmm5, %xmm0
+; X86-SSE2-NEXT: por %xmm6, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: addl $28, %esp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE42-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm5
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X86-SSE42-NEXT: movapd %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm6, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm9, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm7, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm6
+; X64-SSE2-NEXT: por %xmm0, %xmm6
+; X64-SSE2-NEXT: pand %xmm8, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm8
+; X64-SSE2-NEXT: por %xmm1, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm8, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm6
+; X64-SSE2-NEXT: pandn %xmm8, %xmm1
+; X64-SSE2-NEXT: por %xmm6, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm5
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X64-SSE42-NEXT: movapd %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <8 x i64> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+ %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <8 x i64> %3, %4
+ %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+ %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <8 x i64> %6, %7
+ %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+ %10 = extractelement <8 x i64> %9, i32 0
+ ret i64 %10
+}
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm5
+; X86-SSE2-NEXT: por %xmm1, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm4
+; X86-SSE2-NEXT: pandn %xmm5, %xmm0
+; X86-SSE2-NEXT: por %xmm4, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxsd %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxsd %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm5
+; X64-SSE2-NEXT: por %xmm1, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm4
+; X64-SSE2-NEXT: pandn %xmm5, %xmm0
+; X64-SSE2-NEXT: por %xmm4, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxsd %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxsd %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <16 x i32> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+ %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <16 x i32> %3, %4
+ %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+ %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <16 x i32> %6, %7
+ %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+ %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <16 x i32> %9, %10
+ %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+ %13 = extractelement <16 x i32> %12, i32 0
+ ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v32i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pmaxsw %xmm3, %xmm1
+; X86-SSE-NEXT: pmaxsw %xmm2, %xmm0
+; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v32i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pmaxsw %xmm3, %xmm1
+; X64-SSE-NEXT: pmaxsw %xmm2, %xmm0
+; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <32 x i16> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+ %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <32 x i16> %3, %4
+ %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+ %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <32 x i16> %6, %7
+ %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+ %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <32 x i16> %9, %10
+ %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+ %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp sgt <32 x i16> %12, %13
+ %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+ %16 = extractelement <32 x i16> %15, i32 0
+ ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm5
+; X86-SSE2-NEXT: por %xmm1, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm4
+; X86-SSE2-NEXT: pandn %xmm5, %xmm0
+; X86-SSE2-NEXT: por %xmm4, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxsb %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X64-SSE2-NEXT: pcmpgtb %xmm3, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm5
+; X64-SSE2-NEXT: por %xmm1, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm4
+; X64-SSE2-NEXT: pandn %xmm5, %xmm0
+; X64-SSE2-NEXT: por %xmm4, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxsb %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <64 x i8> %a0, %1
+ %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+ %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <64 x i8> %3, %4
+ %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+ %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <64 x i8> %6, %7
+ %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+ %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <64 x i8> %9, %10
+ %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+ %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp sgt <64 x i8> %12, %13
+ %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+ %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %17 = icmp sgt <64 x i8> %15, %16
+ %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+ %19 = extractelement <64 x i8> %18, i32 0
+ ret i8 %19
+}
diff --git a/test/CodeGen/X86/horizontal-reduce-smin.ll b/test/CodeGen/X86/horizontal-reduce-smin.ll
new file mode 100644
index 00000000000..6feb963426b
--- /dev/null
+++ b/test/CodeGen/X86/horizontal-reduce-smin.ll
@@ -0,0 +1,1898 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+ %2 = icmp slt <2 x i64> %a0, %1
+ %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+ %4 = extractelement <2 x i64> %3, i32 0
+ ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp slt <4 x i32> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <4 x i32> %3, %4
+ %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+ %7 = extractelement <4 x i32> %6, i32 0
+ ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v8i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v8i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <8 x i16> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <8 x i16> %3, %4
+ %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+ %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <8 x i16> %6, %7
+ %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+ %10 = extractelement <8 x i16> %9, i32 0
+ ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <16 x i8> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <16 x i8> %3, %4
+ %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+ %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <16 x i8> %6, %7
+ %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+ %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <16 x i8> %9, %10
+ %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+ %13 = extractelement <16 x i8> %12, i32 0
+ ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm6, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movq %xmm2, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp slt <4 x i64> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+ %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <4 x i64> %3, %4
+ %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+ %7 = extractelement <4 x i64> %6, i32 0
+ ret i64 %7
+}
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <8 x i32> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+ %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <8 x i32> %3, %4
+ %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+ %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <8 x i32> %6, %7
+ %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+ %10 = extractelement <8 x i32> %9, i32 0
+ ret i32 %10
+}
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v16i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v16i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <16 x i16> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+ %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <16 x i16> %3, %4
+ %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+ %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <16 x i16> %6, %7
+ %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+ %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <16 x i16> %9, %10
+ %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+ %13 = extractelement <16 x i16> %12, i32 0
+ ret i16 %13
+}
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <32 x i8> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+ %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <32 x i8> %3, %4
+ %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+ %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <32 x i8> %6, %7
+ %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+ %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <32 x i8> %9, %10
+ %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+ %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp slt <32 x i8> %12, %13
+ %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+ %16 = extractelement <32 x i8> %15, i32 0
+ ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: subl $28, %esp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT: pand %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm7, %xmm6
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT: por %xmm2, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm6, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: addl $28, %esp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm5
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X86-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT: movapd %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm5
+; X64-SSE2-NEXT: pxor %xmm9, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT: pxor %xmm9, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm9, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm9, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm7, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm6
+; X64-SSE2-NEXT: por %xmm1, %xmm6
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm9, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT: pxor %xmm9, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm6, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm9, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm9
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm9
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm4, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm5
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X64-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movapd %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <8 x i64> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+ %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <8 x i64> %3, %4
+ %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+ %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <8 x i64> %6, %7
+ %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+ %10 = extractelement <8 x i64> %9, i32 0
+ ret i64 %10
+}
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm4
+; X86-SSE2-NEXT: por %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm4, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminsd %xmm3, %xmm1
+; X86-SSE42-NEXT: pminsd %xmm2, %xmm0
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm4
+; X64-SSE2-NEXT: por %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm5
+; X64-SSE2-NEXT: pandn %xmm4, %xmm0
+; X64-SSE2-NEXT: por %xmm5, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminsd %xmm3, %xmm1
+; X64-SSE42-NEXT: pminsd %xmm2, %xmm0
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <16 x i32> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+ %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <16 x i32> %3, %4
+ %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+ %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <16 x i32> %6, %7
+ %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+ %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <16 x i32> %9, %10
+ %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+ %13 = extractelement <16 x i32> %12, i32 0
+ ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v32i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pminsw %xmm3, %xmm1
+; X86-SSE-NEXT: pminsw %xmm2, %xmm0
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v32i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pminsw %xmm3, %xmm1
+; X64-SSE-NEXT: pminsw %xmm2, %xmm0
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <32 x i16> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+ %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <32 x i16> %3, %4
+ %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+ %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <32 x i16> %6, %7
+ %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+ %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <32 x i16> %9, %10
+ %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+ %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp slt <32 x i16> %12, %13
+ %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+ %16 = extractelement <32 x i16> %15, i32 0
+ ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm4
+; X86-SSE2-NEXT: por %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm4, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminsb %xmm3, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm2, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm4
+; X64-SSE2-NEXT: por %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm5
+; X64-SSE2-NEXT: pandn %xmm4, %xmm0
+; X64-SSE2-NEXT: por %xmm5, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminsb %xmm3, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm2, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <64 x i8> %a0, %1
+ %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+ %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <64 x i8> %3, %4
+ %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+ %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <64 x i8> %6, %7
+ %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+ %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <64 x i8> %9, %10
+ %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+ %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp slt <64 x i8> %12, %13
+ %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+ %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %17 = icmp slt <64 x i8> %15, %16
+ %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+ %19 = extractelement <64 x i8> %18, i32 0
+ ret i8 %19
+}
diff --git a/test/CodeGen/X86/horizontal-reduce-umax.ll b/test/CodeGen/X86/horizontal-reduce-umax.ll
new file mode 100644
index 00000000000..ee9d8955cb5
--- /dev/null
+++ b/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -0,0 +1,2203 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm2, %xmm3
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X86-AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm2, %xmm3
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+ %2 = icmp ugt <2 x i64> %a0, %1
+ %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+ %4 = extractelement <2 x i64> %3, i32 0
+ ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm3, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm3, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp ugt <4 x i32> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <4 x i32> %3, %4
+ %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+ %7 = extractelement <4 x i32> %6, i32 0
+ ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm3, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pxor %xmm1, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm1, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm3, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: movd %xmm3, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <8 x i16> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <8 x i16> %3, %4
+ %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+ %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <8 x i16> %6, %7
+ %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+ %10 = extractelement <8 x i16> %9, i32 0
+ ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <16 x i8> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <16 x i8> %3, %4
+ %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+ %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <16 x i8> %6, %7
+ %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+ %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <16 x i8> %9, %10
+ %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+ %13 = extractelement <16 x i8> %12, i32 0
+ ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE42-NEXT: pxor %xmm3, %xmm4
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm2, %xmm3
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; X86-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm6, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movq %xmm2, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE42-NEXT: pxor %xmm3, %xmm4
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm2, %xmm3
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; X64-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp ugt <4 x i64> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+ %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <4 x i64> %3, %4
+ %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+ %7 = extractelement <4 x i64> %6, i32 0
+ ret i64 %7
+}
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm4, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm4, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movd %xmm3, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <8 x i32> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+ %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <8 x i32> %3, %4
+ %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+ %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <8 x i32> %6, %7
+ %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+ %10 = extractelement <8 x i32> %9, i32 0
+ ret i32 %10
+}
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm4, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm3, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm4, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm3, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <16 x i16> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+ %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <16 x i16> %3, %4
+ %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+ %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <16 x i16> %6, %7
+ %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+ %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <16 x i16> %9, %10
+ %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+ %13 = extractelement <16 x i16> %12, i32 0
+ ret i16 %13
+}
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <32 x i8> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+ %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <32 x i8> %3, %4
+ %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+ %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <32 x i8> %6, %7
+ %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+ %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <32 x i8> %9, %10
+ %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+ %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ugt <32 x i8> %12, %13
+ %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+ %16 = extractelement <32 x i8> %15, i32 0
+ ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: subl $28, %esp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT: pand %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm7, %xmm6
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT: por %xmm2, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm6
+; X86-SSE2-NEXT: pandn %xmm5, %xmm0
+; X86-SSE2-NEXT: por %xmm6, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: addl $28, %esp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm6 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm6, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm5
+; X86-SSE42-NEXT: pxor %xmm6, %xmm5
+; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm5
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm7
+; X86-SSE42-NEXT: pxor %xmm6, %xmm7
+; X86-SSE42-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE42-NEXT: pxor %xmm6, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm7, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X86-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT: movapd %xmm3, %xmm1
+; X86-SSE42-NEXT: xorpd %xmm6, %xmm1
+; X86-SSE42-NEXT: movapd %xmm2, %xmm0
+; X86-SSE42-NEXT: xorpd %xmm6, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm6, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm6
+; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm6, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm9, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm7, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm6
+; X64-SSE2-NEXT: por %xmm0, %xmm6
+; X64-SSE2-NEXT: pand %xmm8, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm8
+; X64-SSE2-NEXT: por %xmm1, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm8, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm6
+; X64-SSE2-NEXT: pandn %xmm8, %xmm1
+; X64-SSE2-NEXT: por %xmm6, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm6, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm5
+; X64-SSE42-NEXT: pxor %xmm6, %xmm5
+; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm5
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm7
+; X64-SSE42-NEXT: pxor %xmm6, %xmm7
+; X64-SSE42-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE42-NEXT: pxor %xmm6, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm7, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X64-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movapd %xmm3, %xmm1
+; X64-SSE42-NEXT: xorpd %xmm6, %xmm1
+; X64-SSE42-NEXT: movapd %xmm2, %xmm0
+; X64-SSE42-NEXT: xorpd %xmm6, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm6, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm6
+; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <8 x i64> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+ %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <8 x i64> %3, %4
+ %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+ %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <8 x i64> %6, %7
+ %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+ %10 = extractelement <8 x i64> %9, i32 0
+ ret i64 %10
+}
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm7
+; X86-SSE2-NEXT: por %xmm0, %xmm7
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm7
+; X86-SSE2-NEXT: pandn %xmm6, %xmm1
+; X86-SSE2-NEXT: por %xmm7, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxud %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxud %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm7
+; X64-SSE2-NEXT: por %xmm0, %xmm7
+; X64-SSE2-NEXT: pand %xmm6, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm6
+; X64-SSE2-NEXT: por %xmm1, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm7
+; X64-SSE2-NEXT: pandn %xmm6, %xmm1
+; X64-SSE2-NEXT: por %xmm7, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxud %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxud %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <16 x i32> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+ %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <16 x i32> %3, %4
+ %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+ %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <16 x i32> %6, %7
+ %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+ %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <16 x i32> %9, %10
+ %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+ %13 = extractelement <16 x i32> %12, i32 0
+ ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: pcmpgtw %xmm5, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtw %xmm5, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm7
+; X86-SSE2-NEXT: por %xmm0, %xmm7
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm7
+; X86-SSE2-NEXT: pandn %xmm6, %xmm1
+; X86-SSE2-NEXT: por %xmm7, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxuw %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: pcmpgtw %xmm5, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtw %xmm5, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm7
+; X64-SSE2-NEXT: por %xmm0, %xmm7
+; X64-SSE2-NEXT: pand %xmm6, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm6
+; X64-SSE2-NEXT: por %xmm1, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm7
+; X64-SSE2-NEXT: pandn %xmm6, %xmm1
+; X64-SSE2-NEXT: por %xmm7, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxuw %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <32 x i16> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+ %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <32 x i16> %3, %4
+ %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+ %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <32 x i16> %6, %7
+ %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+ %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <32 x i16> %9, %10
+ %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+ %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ugt <32 x i16> %12, %13
+ %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+ %16 = extractelement <32 x i16> %15, i32 0
+ ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pmaxub %xmm3, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm2, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxub %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pmaxub %xmm3, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm2, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxub %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <64 x i8> %a0, %1
+ %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+ %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <64 x i8> %3, %4
+ %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+ %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <64 x i8> %6, %7
+ %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+ %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <64 x i8> %9, %10
+ %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+ %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ugt <64 x i8> %12, %13
+ %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+ %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %17 = icmp ugt <64 x i8> %15, %16
+ %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+ %19 = extractelement <64 x i8> %18, i32 0
+ ret i8 %19
+}
diff --git a/test/CodeGen/X86/horizontal-reduce-umin.ll b/test/CodeGen/X86/horizontal-reduce-umin.ll
new file mode 100644
index 00000000000..43369673042
--- /dev/null
+++ b/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -0,0 +1,2207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE42-NEXT: pxor %xmm0, %xmm3
+; X86-SSE42-NEXT: pxor %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X86-AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE42-NEXT: pxor %xmm0, %xmm3
+; X64-SSE42-NEXT: pxor %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+ %2 = icmp ult <2 x i64> %a0, %1
+ %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+ %4 = extractelement <2 x i64> %3, i32 0
+ ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp ult <4 x i32> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <4 x i32> %3, %4
+ %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+ %7 = extractelement <4 x i32> %6, i32 0
+ ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm4, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm3, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pxor %xmm1, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm2
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm4, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm3, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <8 x i16> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <8 x i16> %3, %4
+ %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+ %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <8 x i16> %6, %7
+ %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+ %10 = extractelement <8 x i16> %9, i32 0
+ ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <16 x i8> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <16 x i8> %3, %4
+ %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+ %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <16 x i8> %6, %7
+ %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+ %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <16 x i8> %9, %10
+ %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+ %13 = extractelement <16 x i8> %12, i32 0
+ ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE42-NEXT: pxor %xmm3, %xmm4
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm2, %xmm3
+; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm3
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; X86-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm6, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movq %xmm2, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE42-NEXT: pxor %xmm3, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm2, %xmm3
+; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm3
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; X64-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp ult <4 x i64> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+ %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <4 x i64> %3, %4
+ %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+ %7 = extractelement <4 x i64> %6, i32 0
+ ret i64 %7
+}
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm4, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm3, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm4, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm3, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <8 x i32> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+ %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <8 x i32> %3, %4
+ %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+ %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <8 x i32> %6, %7
+ %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+ %10 = extractelement <8 x i32> %9, i32 0
+ ret i32 %10
+}
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm4, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm4
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm4, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm4
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <16 x i16> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+ %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <16 x i16> %3, %4
+ %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+ %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <16 x i16> %6, %7
+ %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+ %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <16 x i16> %9, %10
+ %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+ %13 = extractelement <16 x i16> %12, i32 0
+ ret i16 %13
+}
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <32 x i8> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+ %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <32 x i8> %3, %4
+ %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+ %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <32 x i8> %6, %7
+ %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+ %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <32 x i8> %9, %10
+ %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+ %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ult <32 x i8> %12, %13
+ %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+ %16 = extractelement <32 x i8> %15, i32 0
+ ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: subl $28, %esp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT: pand %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm7, %xmm6
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT: por %xmm2, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm6, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: addl $28, %esp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm5
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: pxor %xmm4, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE42-NEXT: pxor %xmm4, %xmm6
+; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm6
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm7
+; X86-SSE42-NEXT: pxor %xmm4, %xmm7
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm4, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm7, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT: movdqa %xmm6, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2
+; X86-SSE42-NEXT: movapd %xmm2, %xmm1
+; X86-SSE42-NEXT: xorpd %xmm4, %xmm1
+; X86-SSE42-NEXT: movapd %xmm3, %xmm0
+; X86-SSE42-NEXT: xorpd %xmm4, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm4, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm4
+; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm4
+; X86-SSE42-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm5
+; X64-SSE2-NEXT: pxor %xmm9, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT: pxor %xmm9, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm9, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm9, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm7, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm6
+; X64-SSE2-NEXT: por %xmm1, %xmm6
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm9, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT: pxor %xmm9, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm6, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm9, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm9
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm9
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm4, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm5
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: pxor %xmm4, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE42-NEXT: pxor %xmm4, %xmm6
+; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm6
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm7
+; X64-SSE42-NEXT: pxor %xmm4, %xmm7
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm4, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm7, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movdqa %xmm6, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2
+; X64-SSE42-NEXT: movapd %xmm2, %xmm1
+; X64-SSE42-NEXT: xorpd %xmm4, %xmm1
+; X64-SSE42-NEXT: movapd %xmm3, %xmm0
+; X64-SSE42-NEXT: xorpd %xmm4, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm4, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm4
+; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <8 x i64> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+ %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <8 x i64> %3, %4
+ %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+ %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <8 x i64> %6, %7
+ %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+ %10 = extractelement <8 x i64> %9, i32 0
+ ret i64 %10
+}
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm7
+; X86-SSE2-NEXT: por %xmm1, %xmm7
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm5
+; X86-SSE2-NEXT: pandn %xmm7, %xmm1
+; X86-SSE2-NEXT: por %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm4
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: movd %xmm4, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminud %xmm3, %xmm1
+; X86-SSE42-NEXT: pminud %xmm2, %xmm0
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm7
+; X64-SSE2-NEXT: por %xmm1, %xmm7
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm7, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm4
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: movd %xmm4, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminud %xmm3, %xmm1
+; X64-SSE42-NEXT: pminud %xmm2, %xmm0
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <16 x i32> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+ %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <16 x i32> %3, %4
+ %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+ %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <16 x i32> %6, %7
+ %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+ %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <16 x i32> %9, %10
+ %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+ %13 = extractelement <16 x i32> %12, i32 0
+ ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtw %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtw %xmm6, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm7
+; X86-SSE2-NEXT: por %xmm1, %xmm7
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm5
+; X86-SSE2-NEXT: pandn %xmm7, %xmm1
+; X86-SSE2-NEXT: por %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm3, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm4
+; X86-SSE2-NEXT: por %xmm2, %xmm4
+; X86-SSE2-NEXT: movd %xmm4, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminuw %xmm3, %xmm1
+; X86-SSE42-NEXT: pminuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtw %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtw %xmm6, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm7
+; X64-SSE2-NEXT: por %xmm1, %xmm7
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm7, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm3, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm4
+; X64-SSE2-NEXT: por %xmm2, %xmm4
+; X64-SSE2-NEXT: movd %xmm4, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminuw %xmm3, %xmm1
+; X64-SSE42-NEXT: pminuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <32 x i16> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+ %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <32 x i16> %3, %4
+ %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+ %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <32 x i16> %6, %7
+ %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+ %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <32 x i16> %9, %10
+ %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+ %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ult <32 x i16> %12, %13
+ %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+ %16 = extractelement <32 x i16> %15, i32 0
+ ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pminub %xmm3, %xmm1
+; X86-SSE2-NEXT: pminub %xmm2, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminub %xmm3, %xmm1
+; X86-SSE42-NEXT: pminub %xmm2, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pminub %xmm3, %xmm1
+; X64-SSE2-NEXT: pminub %xmm2, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminub %xmm3, %xmm1
+; X64-SSE42-NEXT: pminub %xmm2, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <64 x i8> %a0, %1
+ %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+ %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <64 x i8> %3, %4
+ %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+ %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <64 x i8> %6, %7
+ %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+ %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <64 x i8> %9, %10
+ %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+ %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ult <64 x i8> %12, %13
+ %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+ %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %17 = icmp ult <64 x i8> %15, %16
+ %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+ %19 = extractelement <64 x i8> %18, i32 0
+ ret i8 %19
+}
diff --git a/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/test/CodeGen/X86/illegal-bitfield-loadstore.ll
index fd503aa6c6e..e3b25a539c1 100644
--- a/test/CodeGen/X86/illegal-bitfield-loadstore.ll
+++ b/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -81,6 +81,7 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
; X86-NEXT: orl %edx, %eax
; X86-NEXT: movw %ax, (%ecx)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: i24_insert_bit:
diff --git a/test/CodeGen/X86/imul.ll b/test/CodeGen/X86/imul.ll
index e364b001f94..02782f72108 100644
--- a/test/CodeGen/X86/imul.ll
+++ b/test/CodeGen/X86/imul.ll
@@ -307,6 +307,7 @@ define i64 @test5(i64 %a) {
; X86-NEXT: subl %ecx, %edx
; X86-NEXT: subl %esi, %edx
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%tmp3 = mul i64 %a, -31
@@ -362,6 +363,7 @@ define i64 @test7(i64 %a) {
; X86-NEXT: subl %ecx, %edx
; X86-NEXT: subl %esi, %edx
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%tmp3 = mul i64 %a, -33
@@ -390,6 +392,7 @@ define i64 @testOverflow(i64 %a) {
; X86-NEXT: addl %esi, %edx
; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%tmp3 = mul i64 %a, 9223372036854775807
diff --git a/test/CodeGen/X86/inline-asm-A-constraint.ll b/test/CodeGen/X86/inline-asm-A-constraint.ll
index 2ad011e88e0..7975b318eff 100644
--- a/test/CodeGen/X86/inline-asm-A-constraint.ll
+++ b/test/CodeGen/X86/inline-asm-A-constraint.ll
@@ -19,8 +19,7 @@ entry:
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1
ret { i64, i64 } %.fca.1.insert
}
-; CHECK: lock
-; CHECK-NEXT: cmpxchg16b
+; CHECK: lock cmpxchg16b
attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }
diff --git a/test/CodeGen/X86/lea-opt-cse1.ll b/test/CodeGen/X86/lea-opt-cse1.ll
index 05b47690e81..4c9ec3e0d7a 100644
--- a/test/CodeGen/X86/lea-opt-cse1.ll
+++ b/test/CodeGen/X86/lea-opt-cse1.ll
@@ -30,6 +30,7 @@ define void @test_func(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr {
; X86-NEXT: leal 1(%edx,%ecx), %ecx
; X86-NEXT: movl %ecx, 16(%eax)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0
diff --git a/test/CodeGen/X86/lea-opt-cse2.ll b/test/CodeGen/X86/lea-opt-cse2.ll
index 865dd49a6e1..cee6f6792cb 100644
--- a/test/CodeGen/X86/lea-opt-cse2.ll
+++ b/test/CodeGen/X86/lea-opt-cse2.ll
@@ -46,7 +46,9 @@ define void @foo(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr #0 {
; X86-NEXT: leal 1(%esi,%edx), %ecx
; X86-NEXT: movl %ecx, 16(%eax)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
br label %loop
diff --git a/test/CodeGen/X86/lea-opt-cse3.ll b/test/CodeGen/X86/lea-opt-cse3.ll
index 87949b40d48..ed3aff98036 100644
--- a/test/CodeGen/X86/lea-opt-cse3.ll
+++ b/test/CodeGen/X86/lea-opt-cse3.ll
@@ -91,6 +91,7 @@ define i32 @foo1_mult_basic_blocks(i32 %a, i32 %b) local_unnamed_addr #0 {
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: .LBB2_2: # %exit
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%mul = shl i32 %b, 2
@@ -143,6 +144,7 @@ define i32 @foo1_mult_basic_blocks_illegal_scale(i32 %a, i32 %b) local_unnamed_a
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: .LBB3_2: # %exit
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%mul = shl i32 %b, 1
diff --git a/test/CodeGen/X86/lea-opt-cse4.ll b/test/CodeGen/X86/lea-opt-cse4.ll
index 31f31a73d44..d068180c39c 100644
--- a/test/CodeGen/X86/lea-opt-cse4.ll
+++ b/test/CodeGen/X86/lea-opt-cse4.ll
@@ -36,6 +36,7 @@ define void @foo(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr #0 {
; X86-NEXT: leal 1(%ecx,%edx), %ecx
; X86-NEXT: movl %ecx, 16(%eax)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0
@@ -110,7 +111,9 @@ define void @foo_loop(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr #0
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: movl %edx, 16(%eax)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
br label %loop
diff --git a/test/CodeGen/X86/legalize-shift-64.ll b/test/CodeGen/X86/legalize-shift-64.ll
index ca4cfa5b805..7dff2c20d5a 100644
--- a/test/CodeGen/X86/legalize-shift-64.ll
+++ b/test/CodeGen/X86/legalize-shift-64.ll
@@ -117,9 +117,13 @@ define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) {
; CHECK-NEXT: movl %esi, 4(%eax)
; CHECK-NEXT: movl %edi, (%eax)
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popl %edi
+; CHECK-NEXT: .cfi_def_cfa_offset 12
; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl $4
%shl = shl <2 x i64> %A, %B
ret <2 x i64> %shl
@@ -160,6 +164,7 @@ define i32 @test6() {
; CHECK-NEXT: .LBB5_4: # %if.then
; CHECK-NEXT: movl %ebp, %esp
; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: .cfi_def_cfa %esp, 4
; CHECK-NEXT: retl
%x = alloca i32, align 4
%t = alloca i64, align 8
diff --git a/test/CodeGen/X86/live-out-reg-info.ll b/test/CodeGen/X86/live-out-reg-info.ll
index b838065beea..170f73593f6 100644
--- a/test/CodeGen/X86/live-out-reg-info.ll
+++ b/test/CodeGen/X86/live-out-reg-info.ll
@@ -18,6 +18,7 @@ define void @foo(i32 %a) {
; CHECK-NEXT: callq qux
; CHECK-NEXT: .LBB0_2: # %false
; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%t0 = lshr i32 %a, 23
br label %next
diff --git a/test/CodeGen/X86/load-combine.ll b/test/CodeGen/X86/load-combine.ll
index d1f5f41ac7b..d46efc4b5ec 100644
--- a/test/CodeGen/X86/load-combine.ll
+++ b/test/CodeGen/X86/load-combine.ll
@@ -376,6 +376,7 @@ define i32 @load_i32_by_i8_bswap_uses(i32* %arg) {
; CHECK-NEXT: orl %ecx, %eax
; CHECK-NEXT: orl %edx, %eax
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_bswap_uses:
@@ -496,6 +497,7 @@ define i32 @load_i32_by_i8_bswap_store_in_between(i32* %arg, i32* %arg1) {
; CHECK-NEXT: movzbl 3(%ecx), %eax
; CHECK-NEXT: orl %edx, %eax
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_bswap_store_in_between:
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll
index 8983c3acb53..207175aae1a 100644
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -1057,9 +1057,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
; SKX: # BB#0:
; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0
-; SKX-NEXT: kshiftlb $6, %k0, %k0
-; SKX-NEXT: kshiftrb $6, %k0, %k1
+; SKX-NEXT: vptestmq %xmm2, %xmm2, %k1
; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -1068,9 +1066,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
; SKX_32: # BB#0:
; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0
-; SKX_32-NEXT: kshiftlb $6, %k0, %k0
-; SKX_32-NEXT: kshiftrb $6, %k0, %k1
+; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1
; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
@@ -1105,9 +1101,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
; SKX: # BB#0:
; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0
-; SKX-NEXT: kshiftlb $6, %k0, %k0
-; SKX-NEXT: kshiftrb $6, %k0, %k1
+; SKX-NEXT: vptestmq %xmm2, %xmm2, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: vzeroupper
@@ -1117,9 +1111,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
; SKX_32: # BB#0:
; SKX_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0
-; SKX_32-NEXT: kshiftlb $6, %k0, %k0
-; SKX_32-NEXT: kshiftrb $6, %k0, %k1
+; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX_32-NEXT: vzeroupper
@@ -1165,9 +1157,7 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
; SKX: # BB#0:
; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vptestmq %xmm1, %xmm1, %k0
-; SKX-NEXT: kshiftlb $6, %k0, %k0
-; SKX-NEXT: kshiftrb $6, %k0, %k1
+; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
@@ -1176,9 +1166,7 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
; SKX_32: # BB#0:
; SKX_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k0
-; SKX_32-NEXT: kshiftlb $6, %k0, %k0
-; SKX_32-NEXT: kshiftrb $6, %k0, %k1
+; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
; SKX_32-NEXT: vmovaps %xmm2, %xmm0
@@ -1702,6 +1690,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: .cfi_def_cfa %esp, 4
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16i64:
@@ -1736,6 +1725,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; SKX_32-NEXT: retl
%res = call <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
ret <16 x i64> %res
@@ -1819,6 +1809,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; KNL_32-NEXT: vmovapd %zmm2, %zmm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: .cfi_def_cfa %esp, 4
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16f64:
@@ -1853,6 +1844,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; SKX_32-NEXT: vmovapd %zmm2, %zmm0
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; SKX_32-NEXT: retl
%res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
ret <16 x double> %res
@@ -1934,6 +1926,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: .cfi_def_cfa %esp, 4
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
@@ -1967,6 +1960,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
@@ -2050,6 +2044,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: .cfi_def_cfa %esp, 4
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
@@ -2083,6 +2078,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
@@ -2127,6 +2123,7 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: .cfi_def_cfa %esp, 4
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_pr28312:
@@ -2154,6 +2151,7 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; SKX_32-NEXT: retl
%g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
%g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll
index 3e257f5fd85..f43e3f6f56e 100644
--- a/test/CodeGen/X86/masked_memop.ll
+++ b/test/CodeGen/X86/masked_memop.ll
@@ -285,9 +285,7 @@ define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
; AVX512F-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftlw $8, %k0, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
@@ -327,9 +325,7 @@ define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
; AVX512F: ## BB#0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftlw $8, %k0, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
@@ -369,9 +365,7 @@ define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
; AVX512F: ## BB#0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftlw $8, %k0, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
diff --git a/test/CodeGen/X86/memcmp-optsize.ll b/test/CodeGen/X86/memcmp-optsize.ll
index 77d9fa69182..3f5eeba7055 100644
--- a/test/CodeGen/X86/memcmp-optsize.ll
+++ b/test/CodeGen/X86/memcmp-optsize.ll
@@ -156,36 +156,36 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize {
define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length3_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl (%eax), %edx
-; X86-NEXT: cmpw (%ecx), %dx
-; X86-NEXT: jne .LBB5_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 2(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 2(%ecx), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl (%ecx), %edx
+; X86-NEXT: cmpw (%eax), %dx
+; X86-NEXT: jne .LBB5_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movb 2(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 2(%eax), %dl
; X86-NEXT: je .LBB5_3
-; X86-NEXT: .LBB5_1: # %res_block
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: incl %eax
+; X86-NEXT: .LBB5_2: # %res_block
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: incl %ecx
; X86-NEXT: .LBB5_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length3_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: cmpw (%rsi), %ax
-; X64-NEXT: jne .LBB5_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB5_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movb 2(%rdi), %cl
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpb 2(%rsi), %cl
; X64-NEXT: je .LBB5_3
-; X64-NEXT: .LBB5_1: # %res_block
+; X64-NEXT: .LBB5_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB5_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -314,36 +314,36 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize {
define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length5_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB10_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 4(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 4(%ecx), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB10_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movb 4(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 4(%eax), %dl
; X86-NEXT: je .LBB10_3
-; X86-NEXT: .LBB10_1: # %res_block
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: incl %eax
+; X86-NEXT: .LBB10_2: # %res_block
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: incl %ecx
; X86-NEXT: .LBB10_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length5_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: cmpl (%rsi), %eax
-; X64-NEXT: jne .LBB10_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB10_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movb 4(%rdi), %cl
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpb 4(%rsi), %cl
; X64-NEXT: je .LBB10_3
-; X64-NEXT: .LBB10_1: # %res_block
+; X64-NEXT: .LBB10_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB10_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -356,7 +356,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize {
define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length8:
-; X86: # BB#0: # %loadbb
+; X86: # BB#0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -365,8 +365,8 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: jne .LBB11_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB11_2
+; X86-NEXT: # BB#1: # %loadbb1
; X86-NEXT: movl 4(%esi), %ecx
; X86-NEXT: movl 4(%eax), %edx
; X86-NEXT: bswapl %ecx
@@ -374,7 +374,7 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: je .LBB11_3
-; X86-NEXT: .LBB11_1: # %res_block
+; X86-NEXT: .LBB11_2: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: setae %al
@@ -400,22 +400,22 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length8_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB12_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movl 4(%eax), %edx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpl 4(%ecx), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB12_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movl 4(%ecx), %edx
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpl 4(%eax), %edx
; X86-NEXT: je .LBB12_3
-; X86-NEXT: .LBB12_1: # %res_block
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: incl %eax
+; X86-NEXT: .LBB12_2: # %res_block
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: incl %ecx
; X86-NEXT: .LBB12_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: sete %al
; X86-NEXT: retl
;
@@ -432,15 +432,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize {
define i1 @length8_eq_const(i8* %X) nounwind optsize {
; X86-LABEL: length8_eq_const:
-; X86: # BB#0: # %loadbb
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130
-; X86-NEXT: jne .LBB13_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB13_2
+; X86-NEXT: # BB#1: # %loadbb1
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534
; X86-NEXT: je .LBB13_3
-; X86-NEXT: .LBB13_1: # %res_block
+; X86-NEXT: .LBB13_2: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: incl %eax
; X86-NEXT: .LBB13_3: # %endblock
@@ -473,16 +473,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length12_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: cmpq (%rsi), %rax
-; X64-NEXT: jne .LBB14_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB14_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpl 8(%rsi), %ecx
; X64-NEXT: je .LBB14_3
-; X64-NEXT: .LBB14_1: # %res_block
+; X64-NEXT: .LBB14_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB14_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -505,28 +505,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length12:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB15_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB15_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: movl 8(%rsi), %edx
; X64-NEXT: bswapl %ecx
; X64-NEXT: bswapl %edx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB15_1
-; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
-; X64-NEXT: .LBB15_1: # %res_block
+; X64-NEXT: je .LBB15_3
+; X64-NEXT: .LBB15_2: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
+; X64-NEXT: .LBB15_3: # %endblock
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
ret i32 %m
@@ -546,28 +545,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length16:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB16_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB16_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rcx
; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB16_1
-; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
-; X64-NEXT: .LBB16_1: # %res_block
+; X64-NEXT: je .LBB16_3
+; X64-NEXT: .LBB16_2: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
+; X64-NEXT: .LBB16_3: # %endblock
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
ret i32 %m
@@ -701,19 +699,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize {
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB20_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB20_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movq 16(%rdi), %rcx
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
; X64-SSE2-NEXT: je .LBB20_3
-; X64-SSE2-NEXT: .LBB20_1: # %res_block
+; X64-SSE2-NEXT: .LBB20_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB20_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -721,18 +719,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize {
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length24_eq:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX2-NEXT: jne .LBB20_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB20_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: movq 16(%rdi), %rcx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx
; X64-AVX2-NEXT: je .LBB20_3
-; X64-AVX2-NEXT: .LBB20_1: # %res_block
+; X64-AVX2-NEXT: .LBB20_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB20_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
@@ -757,18 +755,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize {
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq_const:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB21_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB21_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
; X64-SSE2-NEXT: je .LBB21_3
-; X64-SSE2-NEXT: .LBB21_1: # %res_block
+; X64-SSE2-NEXT: .LBB21_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB21_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -776,18 +774,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize {
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length24_eq_const:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX2-NEXT: jne .LBB21_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB21_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
; X64-AVX2-NEXT: cmpq %rcx, 16(%rdi)
; X64-AVX2-NEXT: je .LBB21_3
-; X64-AVX2-NEXT: .LBB21_1: # %res_block
+; X64-AVX2-NEXT: .LBB21_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB21_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
@@ -833,7 +831,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
; X86-NOSSE-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq:
-; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2: # BB#0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
@@ -841,8 +839,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
-; X86-SSE2-NEXT: jne .LBB23_1
-; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: jne .LBB23_2
+; X86-SSE2-NEXT: # BB#1: # %loadbb1
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -850,7 +848,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X86-SSE2-NEXT: je .LBB23_3
-; X86-SSE2-NEXT: .LBB23_1: # %res_block
+; X86-SSE2-NEXT: .LBB23_2: # %res_block
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: incl %eax
; X86-SSE2-NEXT: .LBB23_3: # %endblock
@@ -859,14 +857,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB23_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB23_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -874,7 +872,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-SSE2-NEXT: je .LBB23_3
-; X64-SSE2-NEXT: .LBB23_1: # %res_block
+; X64-SSE2-NEXT: .LBB23_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB23_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -909,21 +907,21 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize {
; X86-NOSSE-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq_const:
-; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2: # BB#0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
-; X86-SSE2-NEXT: jne .LBB24_1
-; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: jne .LBB24_2
+; X86-SSE2-NEXT: # BB#1: # %loadbb1
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X86-SSE2-NEXT: je .LBB24_3
-; X86-SSE2-NEXT: .LBB24_1: # %res_block
+; X86-SSE2-NEXT: .LBB24_2: # %res_block
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: incl %eax
; X86-SSE2-NEXT: .LBB24_3: # %endblock
@@ -932,20 +930,20 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize {
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB24_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB24_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-SSE2-NEXT: je .LBB24_3
-; X64-SSE2-NEXT: .LBB24_1: # %res_block
+; X64-SSE2-NEXT: .LBB24_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB24_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -1009,20 +1007,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize {
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length64_eq:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
; X64-AVX2-NEXT: cmpl $-1, %eax
-; X64-AVX2-NEXT: jne .LBB26_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB26_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: cmpl $-1, %ecx
; X64-AVX2-NEXT: je .LBB26_3
-; X64-AVX2-NEXT: .LBB26_1: # %res_block
+; X64-AVX2-NEXT: .LBB26_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB26_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
@@ -1059,20 +1057,20 @@ define i1 @length64_eq_const(i8* %X) nounwind optsize {
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length64_eq_const:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
; X64-AVX2-NEXT: cmpl $-1, %eax
-; X64-AVX2-NEXT: jne .LBB27_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB27_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: cmpl $-1, %ecx
; X64-AVX2-NEXT: je .LBB27_3
-; X64-AVX2-NEXT: .LBB27_1: # %res_block
+; X64-AVX2-NEXT: .LBB27_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB27_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index 393e4c42d8b..84fd45b0a08 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -187,35 +187,35 @@ define i32 @length3(i8* %X, i8* %Y) nounwind {
define i1 @length3_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length3_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl (%eax), %edx
-; X86-NEXT: cmpw (%ecx), %dx
-; X86-NEXT: jne .LBB7_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 2(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 2(%ecx), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl (%ecx), %edx
+; X86-NEXT: cmpw (%eax), %dx
+; X86-NEXT: jne .LBB7_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movb 2(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 2(%eax), %dl
; X86-NEXT: je .LBB7_3
-; X86-NEXT: .LBB7_1: # %res_block
-; X86-NEXT: movl $1, %eax
+; X86-NEXT: .LBB7_2: # %res_block
+; X86-NEXT: movl $1, %ecx
; X86-NEXT: .LBB7_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length3_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: cmpw (%rsi), %ax
-; X64-NEXT: jne .LBB7_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB7_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movb 2(%rdi), %cl
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpb 2(%rsi), %cl
; X64-NEXT: je .LBB7_3
-; X64-NEXT: .LBB7_1: # %res_block
+; X64-NEXT: .LBB7_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB7_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -344,35 +344,35 @@ define i32 @length5(i8* %X, i8* %Y) nounwind {
define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length5_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB12_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 4(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 4(%ecx), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB12_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movb 4(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 4(%eax), %dl
; X86-NEXT: je .LBB12_3
-; X86-NEXT: .LBB12_1: # %res_block
-; X86-NEXT: movl $1, %eax
+; X86-NEXT: .LBB12_2: # %res_block
+; X86-NEXT: movl $1, %ecx
; X86-NEXT: .LBB12_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length5_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: cmpl (%rsi), %eax
-; X64-NEXT: jne .LBB12_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB12_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movb 4(%rdi), %cl
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpb 4(%rsi), %cl
; X64-NEXT: je .LBB12_3
-; X64-NEXT: .LBB12_1: # %res_block
+; X64-NEXT: .LBB12_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB12_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -385,7 +385,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
define i32 @length8(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length8:
-; X86: # BB#0: # %loadbb
+; X86: # BB#0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -394,23 +394,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: jne .LBB13_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB13_2
+; X86-NEXT: # BB#1: # %loadbb1
; X86-NEXT: movl 4(%esi), %ecx
; X86-NEXT: movl 4(%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: jne .LBB13_1
-; X86-NEXT: # BB#3: # %endblock
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-; X86-NEXT: .LBB13_1: # %res_block
+; X86-NEXT: je .LBB13_3
+; X86-NEXT: .LBB13_2: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: setae %al
; X86-NEXT: leal -1(%eax,%eax), %eax
+; X86-NEXT: .LBB13_3: # %endblock
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
@@ -431,21 +429,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
define i1 @length8_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length8_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB14_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movl 4(%eax), %edx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpl 4(%ecx), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB14_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movl 4(%ecx), %edx
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpl 4(%eax), %edx
; X86-NEXT: je .LBB14_3
-; X86-NEXT: .LBB14_1: # %res_block
-; X86-NEXT: movl $1, %eax
+; X86-NEXT: .LBB14_2: # %res_block
+; X86-NEXT: movl $1, %ecx
; X86-NEXT: .LBB14_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: sete %al
; X86-NEXT: retl
;
@@ -462,15 +460,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind {
define i1 @length8_eq_const(i8* %X) nounwind {
; X86-LABEL: length8_eq_const:
-; X86: # BB#0: # %loadbb
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130
-; X86-NEXT: jne .LBB15_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB15_2
+; X86-NEXT: # BB#1: # %loadbb1
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534
; X86-NEXT: je .LBB15_3
-; X86-NEXT: .LBB15_1: # %res_block
+; X86-NEXT: .LBB15_2: # %res_block
; X86-NEXT: movl $1, %eax
; X86-NEXT: .LBB15_3: # %endblock
; X86-NEXT: testl %eax, %eax
@@ -502,16 +500,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length12_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: cmpq (%rsi), %rax
-; X64-NEXT: jne .LBB16_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB16_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpl 8(%rsi), %ecx
; X64-NEXT: je .LBB16_3
-; X64-NEXT: .LBB16_1: # %res_block
+; X64-NEXT: .LBB16_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB16_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -534,28 +532,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length12:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB17_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB17_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: movl 8(%rsi), %edx
; X64-NEXT: bswapl %ecx
; X64-NEXT: bswapl %edx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB17_1
-; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
-; X64-NEXT: .LBB17_1: # %res_block
+; X64-NEXT: je .LBB17_3
+; X64-NEXT: .LBB17_2: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
+; X64-NEXT: .LBB17_3: # %endblock
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
ret i32 %m
@@ -575,28 +572,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length16:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB18_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB18_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rcx
; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB18_1
-; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
-; X64-NEXT: .LBB18_1: # %res_block
+; X64-NEXT: je .LBB18_3
+; X64-NEXT: .LBB18_2: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
+; X64-NEXT: .LBB18_3: # %endblock
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
ret i32 %m
@@ -754,19 +750,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind {
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB22_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB22_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movq 16(%rdi), %rcx
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
; X64-SSE2-NEXT: je .LBB22_3
-; X64-SSE2-NEXT: .LBB22_1: # %res_block
+; X64-SSE2-NEXT: .LBB22_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB22_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -774,18 +770,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind {
; X64-SSE2-NEXT: retq
;
; X64-AVX-LABEL: length24_eq:
-; X64-AVX: # BB#0: # %loadbb
+; X64-AVX: # BB#0:
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX-NEXT: jne .LBB22_1
-; X64-AVX-NEXT: # BB#2: # %loadbb1
+; X64-AVX-NEXT: jne .LBB22_2
+; X64-AVX-NEXT: # BB#1: # %loadbb1
; X64-AVX-NEXT: movq 16(%rdi), %rcx
; X64-AVX-NEXT: xorl %eax, %eax
; X64-AVX-NEXT: cmpq 16(%rsi), %rcx
; X64-AVX-NEXT: je .LBB22_3
-; X64-AVX-NEXT: .LBB22_1: # %res_block
+; X64-AVX-NEXT: .LBB22_2: # %res_block
; X64-AVX-NEXT: movl $1, %eax
; X64-AVX-NEXT: .LBB22_3: # %endblock
; X64-AVX-NEXT: testl %eax, %eax
@@ -810,18 +806,18 @@ define i1 @length24_eq_const(i8* %X) nounwind {
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq_const:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB23_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB23_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
; X64-SSE2-NEXT: je .LBB23_3
-; X64-SSE2-NEXT: .LBB23_1: # %res_block
+; X64-SSE2-NEXT: .LBB23_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB23_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -829,18 +825,18 @@ define i1 @length24_eq_const(i8* %X) nounwind {
; X64-SSE2-NEXT: retq
;
; X64-AVX-LABEL: length24_eq_const:
-; X64-AVX: # BB#0: # %loadbb
+; X64-AVX: # BB#0:
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX-NEXT: jne .LBB23_1
-; X64-AVX-NEXT: # BB#2: # %loadbb1
+; X64-AVX-NEXT: jne .LBB23_2
+; X64-AVX-NEXT: # BB#1: # %loadbb1
; X64-AVX-NEXT: xorl %eax, %eax
; X64-AVX-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
; X64-AVX-NEXT: cmpq %rcx, 16(%rdi)
; X64-AVX-NEXT: je .LBB23_3
-; X64-AVX-NEXT: .LBB23_1: # %res_block
+; X64-AVX-NEXT: .LBB23_2: # %res_block
; X64-AVX-NEXT: movl $1, %eax
; X64-AVX-NEXT: .LBB23_3: # %endblock
; X64-AVX-NEXT: testl %eax, %eax
@@ -898,7 +894,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq:
-; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2: # BB#0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
@@ -906,8 +902,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
-; X86-SSE2-NEXT: jne .LBB25_1
-; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: jne .LBB25_2
+; X86-SSE2-NEXT: # BB#1: # %loadbb1
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -915,7 +911,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X86-SSE2-NEXT: je .LBB25_3
-; X86-SSE2-NEXT: .LBB25_1: # %res_block
+; X86-SSE2-NEXT: .LBB25_2: # %res_block
; X86-SSE2-NEXT: movl $1, %eax
; X86-SSE2-NEXT: .LBB25_3: # %endblock
; X86-SSE2-NEXT: testl %eax, %eax
@@ -923,14 +919,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB25_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB25_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -938,7 +934,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-SSE2-NEXT: je .LBB25_3
-; X64-SSE2-NEXT: .LBB25_1: # %res_block
+; X64-SSE2-NEXT: .LBB25_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB25_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -946,20 +942,20 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X64-SSE2-NEXT: retq
;
; X64-AVX1-LABEL: length32_eq:
-; X64-AVX1: # BB#0: # %loadbb
+; X64-AVX1: # BB#0:
; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX1-NEXT: jne .LBB25_1
-; X64-AVX1-NEXT: # BB#2: # %loadbb1
+; X64-AVX1-NEXT: jne .LBB25_2
+; X64-AVX1-NEXT: # BB#1: # %loadbb1
; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0
; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm0, %xmm0
; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx
; X64-AVX1-NEXT: xorl %eax, %eax
; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-AVX1-NEXT: je .LBB25_3
-; X64-AVX1-NEXT: .LBB25_1: # %res_block
+; X64-AVX1-NEXT: .LBB25_2: # %res_block
; X64-AVX1-NEXT: movl $1, %eax
; X64-AVX1-NEXT: .LBB25_3: # %endblock
; X64-AVX1-NEXT: testl %eax, %eax
@@ -1006,21 +1002,21 @@ define i1 @length32_eq_const(i8* %X) nounwind {
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq_const:
-; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2: # BB#0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
-; X86-SSE2-NEXT: jne .LBB26_1
-; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: jne .LBB26_2
+; X86-SSE2-NEXT: # BB#1: # %loadbb1
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X86-SSE2-NEXT: je .LBB26_3
-; X86-SSE2-NEXT: .LBB26_1: # %res_block
+; X86-SSE2-NEXT: .LBB26_2: # %res_block
; X86-SSE2-NEXT: movl $1, %eax
; X86-SSE2-NEXT: .LBB26_3: # %endblock
; X86-SSE2-NEXT: testl %eax, %eax
@@ -1028,20 +1024,20 @@ define i1 @length32_eq_const(i8* %X) nounwind {
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB26_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB26_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-SSE2-NEXT: je .LBB26_3
-; X64-SSE2-NEXT: .LBB26_1: # %res_block
+; X64-SSE2-NEXT: .LBB26_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB26_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -1049,20 +1045,20 @@ define i1 @length32_eq_const(i8* %X) nounwind {
; X64-SSE2-NEXT: retq
;
; X64-AVX1-LABEL: length32_eq_const:
-; X64-AVX1: # BB#0: # %loadbb
+; X64-AVX1: # BB#0:
; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX1-NEXT: jne .LBB26_1
-; X64-AVX1-NEXT: # BB#2: # %loadbb1
+; X64-AVX1-NEXT: jne .LBB26_2
+; X64-AVX1-NEXT: # BB#1: # %loadbb1
; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0
; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx
; X64-AVX1-NEXT: xorl %eax, %eax
; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-AVX1-NEXT: je .LBB26_3
-; X64-AVX1-NEXT: .LBB26_1: # %res_block
+; X64-AVX1-NEXT: .LBB26_2: # %res_block
; X64-AVX1-NEXT: movl $1, %eax
; X64-AVX1-NEXT: .LBB26_3: # %endblock
; X64-AVX1-NEXT: testl %eax, %eax
@@ -1136,20 +1132,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind {
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: length64_eq:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
; X64-AVX2-NEXT: cmpl $-1, %eax
-; X64-AVX2-NEXT: jne .LBB28_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB28_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: cmpl $-1, %ecx
; X64-AVX2-NEXT: je .LBB28_3
-; X64-AVX2-NEXT: .LBB28_1: # %res_block
+; X64-AVX2-NEXT: .LBB28_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB28_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
@@ -1197,20 +1193,20 @@ define i1 @length64_eq_const(i8* %X) nounwind {
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: length64_eq_const:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
; X64-AVX2-NEXT: cmpl $-1, %eax
-; X64-AVX2-NEXT: jne .LBB29_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB29_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: cmpl $-1, %ecx
; X64-AVX2-NEXT: je .LBB29_3
-; X64-AVX2-NEXT: .LBB29_1: # %res_block
+; X64-AVX2-NEXT: .LBB29_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB29_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
diff --git a/test/CodeGen/X86/memset-nonzero.ll b/test/CodeGen/X86/memset-nonzero.ll
index f0a957c9417..98e09377ddb 100644
--- a/test/CodeGen/X86/memset-nonzero.ll
+++ b/test/CodeGen/X86/memset-nonzero.ll
@@ -148,6 +148,7 @@ define void @memset_256_nonzero_bytes(i8* %x) {
; SSE-NEXT: movl $256, %edx # imm = 0x100
; SSE-NEXT: callq memset
; SSE-NEXT: popq %rax
+; SSE-NEXT: .cfi_def_cfa_offset 8
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_256_nonzero_bytes:
diff --git a/test/CodeGen/X86/merge-consecutive-loads-128.ll b/test/CodeGen/X86/merge-consecutive-loads-128.ll
index e414f5554de..b909b7c403b 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -72,7 +72,9 @@ define <2 x i64> @merge_2i64_i64_12(i64* %ptr) nounwind uwtable noinline ssp {
; X32-SSE1-NEXT: movl %esi, 4(%eax)
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_2i64_i64_12:
@@ -384,6 +386,7 @@ define <4 x i32> @merge_4i32_i32_23u5(i32* %ptr) nounwind uwtable noinline ssp {
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: movl %ecx, 12(%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_23u5:
@@ -435,7 +438,9 @@ define <4 x i32> @merge_4i32_i32_23u5_inc2(i32* %ptr) nounwind uwtable noinline
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: movl %ecx, 12(%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc2:
@@ -490,7 +495,9 @@ define <4 x i32> @merge_4i32_i32_23u5_inc3(i32* %ptr) nounwind uwtable noinline
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: movl %ecx, 12(%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc3:
@@ -649,7 +656,9 @@ define <4 x i32> @merge_4i32_i32_45zz_inc4(i32* %ptr) nounwind uwtable noinline
; X32-SSE1-NEXT: movl $0, 12(%eax)
; X32-SSE1-NEXT: movl $0, 8(%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc4:
@@ -701,7 +710,9 @@ define <4 x i32> @merge_4i32_i32_45zz_inc5(i32* %ptr) nounwind uwtable noinline
; X32-SSE1-NEXT: movl $0, 12(%eax)
; X32-SSE1-NEXT: movl $0, 8(%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc5:
@@ -751,7 +762,9 @@ define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline s
; X32-SSE1-NEXT: movl %esi, 6(%eax)
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_8i16_i16_23u567u9:
@@ -897,9 +910,13 @@ define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(i8* %ptr) nounwind uwtable noin
; X32-SSE1-NEXT: movl %esi, 3(%eax)
; X32-SSE1-NEXT: movw %bp, (%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 16
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
; X32-SSE1-NEXT: popl %ebx
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %ebp
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF:
@@ -1129,7 +1146,9 @@ define <2 x i64> @merge_2i64_i64_12_volatile(i64* %ptr) nounwind uwtable noinlin
; X32-SSE1-NEXT: movl %esi, 4(%eax)
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_2i64_i64_12_volatile:
diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll
index 051c8a710c8..ddcc383b65e 100644
--- a/test/CodeGen/X86/movtopush.ll
+++ b/test/CodeGen/X86/movtopush.ll
@@ -382,8 +382,10 @@ entry:
; LINUX: pushl $1
; LINUX: .cfi_adjust_cfa_offset 4
; LINUX: calll good
-; LINUX: addl $28, %esp
+; LINUX: addl $16, %esp
; LINUX: .cfi_adjust_cfa_offset -16
+; LINUX: addl $12, %esp
+; LINUX: .cfi_def_cfa_offset 4
; LINUX-NOT: add
; LINUX: retl
define void @pr27140() optsize {
diff --git a/test/CodeGen/X86/mul-constant-result.ll b/test/CodeGen/X86/mul-constant-result.ll
index 011b63ce726..f778397f889 100644
--- a/test/CodeGen/X86/mul-constant-result.ll
+++ b/test/CodeGen/X86/mul-constant-result.ll
@@ -34,84 +34,116 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: .LBB0_6:
; X86-NEXT: addl %eax, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_39:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB0_40:
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_7:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_8:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: shll $2, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_9:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_10:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_11:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (,%eax,8), %ecx
; X86-NEXT: jmp .LBB0_12
; X86-NEXT: .LBB0_13:
; X86-NEXT: shll $3, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_14:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_15:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_16:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_17:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: shll $2, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_18:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_19:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: jmp .LBB0_20
; X86-NEXT: .LBB0_21:
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_22:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: shll $4, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_23:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: shll $4, %ecx
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_24:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_25:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: shll $2, %ecx
; X86-NEXT: jmp .LBB0_12
@@ -119,20 +151,26 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: shll $2, %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_27:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_28:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: .LBB0_20:
; X86-NEXT: leal (%eax,%ecx,4), %ecx
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_29:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: shll $3, %ecx
; X86-NEXT: jmp .LBB0_12
@@ -140,13 +178,17 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: shll $3, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_31:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_32:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %ecx
; X86-NEXT: jmp .LBB0_12
@@ -154,21 +196,27 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_34:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %ecx
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_35:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %ecx
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_36:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: shll $5, %ecx
; X86-NEXT: subl %eax, %ecx
@@ -180,10 +228,13 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_38:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: shll $5, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-HSW-LABEL: mult:
@@ -857,8 +908,11 @@ define i32 @foo() local_unnamed_addr #0 {
; X86-NEXT: negl %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 12
; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %ebx
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-HSW-LABEL: foo:
@@ -1072,10 +1126,15 @@ define i32 @foo() local_unnamed_addr #0 {
; X64-HSW-NEXT: negl %ecx
; X64-HSW-NEXT: movl %ecx, %eax
; X64-HSW-NEXT: addq $8, %rsp
+; X64-HSW-NEXT: .cfi_def_cfa_offset 40
; X64-HSW-NEXT: popq %rbx
+; X64-HSW-NEXT: .cfi_def_cfa_offset 32
; X64-HSW-NEXT: popq %r14
+; X64-HSW-NEXT: .cfi_def_cfa_offset 24
; X64-HSW-NEXT: popq %r15
+; X64-HSW-NEXT: .cfi_def_cfa_offset 16
; X64-HSW-NEXT: popq %rbp
+; X64-HSW-NEXT: .cfi_def_cfa_offset 8
; X64-HSW-NEXT: retq
%1 = tail call i32 @mult(i32 1, i32 0)
%2 = icmp ne i32 %1, 1
diff --git a/test/CodeGen/X86/mul-i256.ll b/test/CodeGen/X86/mul-i256.ll
index 0a48ae761ec..1e05b95dda0 100644
--- a/test/CodeGen/X86/mul-i256.ll
+++ b/test/CodeGen/X86/mul-i256.ll
@@ -349,10 +349,15 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 {
; X32-NEXT: movl %eax, 24(%ecx)
; X32-NEXT: movl %edx, 28(%ecx)
; X32-NEXT: addl $88, %esp
+; X32-NEXT: .cfi_def_cfa_offset 20
; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: popl %edi
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: popl %ebp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test:
@@ -421,8 +426,11 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 {
; X64-NEXT: movq %rax, 16(%r9)
; X64-NEXT: movq %rdx, 24(%r9)
; X64-NEXT: popq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 24
; X64-NEXT: popq %r14
+; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: popq %r15
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
%av = load i256, i256* %a
diff --git a/test/CodeGen/X86/mul128.ll b/test/CodeGen/X86/mul128.ll
index 70a6173a19f..0c11f17d8d1 100644
--- a/test/CodeGen/X86/mul128.ll
+++ b/test/CodeGen/X86/mul128.ll
@@ -86,10 +86,15 @@ define i128 @foo(i128 %t, i128 %u) {
; X86-NEXT: movl %edx, 12(%ecx)
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: addl $8, %esp
+; X86-NEXT: .cfi_def_cfa_offset 20
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 12
; X86-NEXT: popl %ebx
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl $4
%k = mul i128 %t, %u
ret i128 %k
diff --git a/test/CodeGen/X86/no-plt.ll b/test/CodeGen/X86/no-plt.ll
new file mode 100644
index 00000000000..d6383c2d7d1
--- /dev/null
+++ b/test/CodeGen/X86/no-plt.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \
+; RUN: | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu \
+; RUN: | FileCheck -check-prefix=X64 %s
+
+define i32 @main() #0 {
+; X64: callq *_Z3foov@GOTPCREL(%rip)
+; X64: callq _Z3barv
+; X64: callq _Z3bazv
+
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval, align 4
+ %call1 = call i32 @_Z3foov()
+ %call2 = call i32 @_Z3barv()
+ %call3 = call i32 @_Z3bazv()
+ ret i32 0
+}
+
+; Function Attrs: nonlazybind
+declare i32 @_Z3foov() #1
+
+declare i32 @_Z3barv() #2
+
+; Function Attrs: nonlazybind
+declare hidden i32 @_Z3bazv() #3
+
+
+attributes #1 = { nonlazybind }
+attributes #3 = { nonlazybind }
diff --git a/test/CodeGen/X86/pop-stack-cleanup-msvc.ll b/test/CodeGen/X86/pop-stack-cleanup-msvc.ll
new file mode 100644
index 00000000000..6330d3de72f
--- /dev/null
+++ b/test/CodeGen/X86/pop-stack-cleanup-msvc.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s | FileCheck %s
+
+target triple = "i686--windows-msvc"
+
+declare { i8*, i32 } @param2_ret2(i32, i32)
+declare i32 @__CxxFrameHandler3(...)
+
+
+define void @test_reserved_regs() minsize optsize personality i32 (...)* @__CxxFrameHandler3 {
+; CHECK-LABEL: test_reserved_regs:
+; CHECK: calll _param2_ret2
+; CHECK-NEXT: popl %ecx
+; CHECK-NEXT: popl %edi
+start:
+ %s = alloca i64
+ store i64 4, i64* %s
+ %0 = invoke { i8*, i32 } @param2_ret2(i32 0, i32 1)
+ to label %out unwind label %cleanup
+
+out:
+ ret void
+
+cleanup:
+ %cp = cleanuppad within none []
+ cleanupret from %cp unwind to caller
+}
diff --git a/test/CodeGen/X86/pr21792.ll b/test/CodeGen/X86/pr21792.ll
index 74f6c5a361f..54eb1fc7272 100644
--- a/test/CodeGen/X86/pr21792.ll
+++ b/test/CodeGen/X86/pr21792.ll
@@ -28,6 +28,7 @@ define void @func(<4 x float> %vx) {
; CHECK-NEXT: leaq stuff+8(%r9), %r9
; CHECK-NEXT: callq toto
; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
entry:
%tmp2 = bitcast <4 x float> %vx to <2 x i64>
diff --git a/test/CodeGen/X86/pr29061.ll b/test/CodeGen/X86/pr29061.ll
index 0cbe75f9ad5..b62d082507d 100644
--- a/test/CodeGen/X86/pr29061.ll
+++ b/test/CodeGen/X86/pr29061.ll
@@ -15,6 +15,7 @@ define void @t1(i8 signext %c) {
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: popl %edi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
entry:
tail call void asm sideeffect "", "{di},~{dirflag},~{fpsr},~{flags}"(i8 %c)
@@ -32,6 +33,7 @@ define void @t2(i8 signext %c) {
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
entry:
tail call void asm sideeffect "", "{si},~{dirflag},~{fpsr},~{flags}"(i8 %c)
diff --git a/test/CodeGen/X86/pr29112.ll b/test/CodeGen/X86/pr29112.ll
index cc670eeb978..d791936bd53 100644
--- a/test/CodeGen/X86/pr29112.ll
+++ b/test/CodeGen/X86/pr29112.ll
@@ -65,6 +65,7 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
; CHECK-NEXT: vaddps {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: addq $88, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
diff --git a/test/CodeGen/X86/pr30430.ll b/test/CodeGen/X86/pr30430.ll
index 0254c0940b8..06007a3a4cf 100644
--- a/test/CodeGen/X86/pr30430.ll
+++ b/test/CodeGen/X86/pr30430.ll
@@ -108,6 +108,7 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
; CHECK-NEXT: vmovss %xmm14, (%rsp) # 4-byte Spill
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
; CHECK-NEXT: retq
entry:
%__A.addr.i = alloca float, align 4
diff --git a/test/CodeGen/X86/pr32241.ll b/test/CodeGen/X86/pr32241.ll
index f48fef5f7fb..02f3bb12291 100644
--- a/test/CodeGen/X86/pr32241.ll
+++ b/test/CodeGen/X86/pr32241.ll
@@ -50,7 +50,9 @@ define i32 @_Z3foov() {
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp)
; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: addl $16, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
entry:
%aa = alloca i16, align 2
diff --git a/test/CodeGen/X86/pr32256.ll b/test/CodeGen/X86/pr32256.ll
index f6e254aaad0..5b6126fbc76 100644
--- a/test/CodeGen/X86/pr32256.ll
+++ b/test/CodeGen/X86/pr32256.ll
@@ -27,6 +27,7 @@ define void @_Z1av() {
; CHECK-NEXT: andb $1, %al
; CHECK-NEXT: movb %al, {{[0-9]+}}(%esp)
; CHECK-NEXT: addl $2, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
entry:
%b = alloca i8, align 1
diff --git a/test/CodeGen/X86/pr32282.ll b/test/CodeGen/X86/pr32282.ll
index d6e6f6eb107..67a0332ac53 100644
--- a/test/CodeGen/X86/pr32282.ll
+++ b/test/CodeGen/X86/pr32282.ll
@@ -43,6 +43,7 @@ define void @foo() {
; X86-NEXT: orl %eax, %edx
; X86-NEXT: setne {{[0-9]+}}(%esp)
; X86-NEXT: popl %eax
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: foo:
diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll
index 11eb6968709..59be67f0579 100644
--- a/test/CodeGen/X86/pr32284.ll
+++ b/test/CodeGen/X86/pr32284.ll
@@ -71,6 +71,7 @@ define void @foo() {
; 686-O0-NEXT: movzbl %al, %ecx
; 686-O0-NEXT: movl %ecx, (%esp)
; 686-O0-NEXT: addl $8, %esp
+; 686-O0-NEXT: .cfi_def_cfa_offset 4
; 686-O0-NEXT: retl
;
; 686-LABEL: foo:
@@ -88,6 +89,7 @@ define void @foo() {
; 686-NEXT: setle %dl
; 686-NEXT: movl %edx, {{[0-9]+}}(%esp)
; 686-NEXT: addl $8, %esp
+; 686-NEXT: .cfi_def_cfa_offset 4
; 686-NEXT: retl
entry:
%a = alloca i8, align 1
@@ -232,10 +234,15 @@ define void @f1() {
; 686-O0-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; 686-O0-NEXT: movl %esi, (%esp) # 4-byte Spill
; 686-O0-NEXT: addl $36, %esp
+; 686-O0-NEXT: .cfi_def_cfa_offset 20
; 686-O0-NEXT: popl %esi
+; 686-O0-NEXT: .cfi_def_cfa_offset 16
; 686-O0-NEXT: popl %edi
+; 686-O0-NEXT: .cfi_def_cfa_offset 12
; 686-O0-NEXT: popl %ebx
+; 686-O0-NEXT: .cfi_def_cfa_offset 8
; 686-O0-NEXT: popl %ebp
+; 686-O0-NEXT: .cfi_def_cfa_offset 4
; 686-O0-NEXT: retl
;
; 686-LABEL: f1:
@@ -277,8 +284,11 @@ define void @f1() {
; 686-NEXT: movl %eax, _ZN8struct_210member_2_0E
; 686-NEXT: movl $0, _ZN8struct_210member_2_0E+4
; 686-NEXT: addl $1, %esp
+; 686-NEXT: .cfi_def_cfa_offset 12
; 686-NEXT: popl %esi
+; 686-NEXT: .cfi_def_cfa_offset 8
; 686-NEXT: popl %edi
+; 686-NEXT: .cfi_def_cfa_offset 4
; 686-NEXT: retl
entry:
%a = alloca i8, align 1
@@ -392,8 +402,11 @@ define void @f2() {
; 686-O0-NEXT: movw %cx, %di
; 686-O0-NEXT: movw %di, (%eax)
; 686-O0-NEXT: addl $2, %esp
+; 686-O0-NEXT: .cfi_def_cfa_offset 12
; 686-O0-NEXT: popl %esi
+; 686-O0-NEXT: .cfi_def_cfa_offset 8
; 686-O0-NEXT: popl %edi
+; 686-O0-NEXT: .cfi_def_cfa_offset 4
; 686-O0-NEXT: retl
;
; 686-LABEL: f2:
@@ -414,6 +427,7 @@ define void @f2() {
; 686-NEXT: sete %dl
; 686-NEXT: movw %dx, (%eax)
; 686-NEXT: addl $2, %esp
+; 686-NEXT: .cfi_def_cfa_offset 4
; 686-NEXT: retl
entry:
%a = alloca i16, align 2
@@ -532,6 +546,7 @@ define void @f3() #0 {
; 686-O0-NEXT: popl %esi
; 686-O0-NEXT: popl %edi
; 686-O0-NEXT: popl %ebp
+; 686-O0-NEXT: .cfi_def_cfa %esp, 4
; 686-O0-NEXT: retl
;
; 686-LABEL: f3:
@@ -558,6 +573,7 @@ define void @f3() #0 {
; 686-NEXT: movl %ecx, var_46
; 686-NEXT: movl %ebp, %esp
; 686-NEXT: popl %ebp
+; 686-NEXT: .cfi_def_cfa %esp, 4
; 686-NEXT: retl
entry:
%a = alloca i64, align 8
diff --git a/test/CodeGen/X86/pr32329.ll b/test/CodeGen/X86/pr32329.ll
index f6bdade24c6..9d1bb90e824 100644
--- a/test/CodeGen/X86/pr32329.ll
+++ b/test/CodeGen/X86/pr32329.ll
@@ -57,9 +57,13 @@ define void @foo() local_unnamed_addr {
; X86-NEXT: imull %eax, %ebx
; X86-NEXT: movb %bl, var_218
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 12
; X86-NEXT: popl %ebx
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: foo:
diff --git a/test/CodeGen/X86/pr32345.ll b/test/CodeGen/X86/pr32345.ll
index f6802887e9e..2bdeca20731 100644
--- a/test/CodeGen/X86/pr32345.ll
+++ b/test/CodeGen/X86/pr32345.ll
@@ -84,6 +84,7 @@ define void @foo() {
; 6860-NEXT: popl %edi
; 6860-NEXT: popl %ebx
; 6860-NEXT: popl %ebp
+; 6860-NEXT: .cfi_def_cfa %esp, 4
; 6860-NEXT: retl
;
; X64-LABEL: foo:
@@ -127,6 +128,7 @@ define void @foo() {
; 686-NEXT: movb %dl, (%eax)
; 686-NEXT: movl %ebp, %esp
; 686-NEXT: popl %ebp
+; 686-NEXT: .cfi_def_cfa %esp, 4
; 686-NEXT: retl
bb:
%tmp = alloca i64, align 8
diff --git a/test/CodeGen/X86/pr32451.ll b/test/CodeGen/X86/pr32451.ll
index 67c0cb39f8c..5b7d1373d34 100644
--- a/test/CodeGen/X86/pr32451.ll
+++ b/test/CodeGen/X86/pr32451.ll
@@ -30,7 +30,9 @@ define i8** @japi1_convert_690(i8**, i8***, i32) {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; CHECK-NEXT: movl %eax, (%ecx)
; CHECK-NEXT: addl $16, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
top:
%3 = alloca i8***
diff --git a/test/CodeGen/X86/pr34088.ll b/test/CodeGen/X86/pr34088.ll
index 2049c5507c6..4d85722057f 100644
--- a/test/CodeGen/X86/pr34088.ll
+++ b/test/CodeGen/X86/pr34088.ll
@@ -27,6 +27,7 @@ define i32 @pr34088() local_unnamed_addr {
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %ebp, %esp
; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: .cfi_def_cfa %esp, 4
; CHECK-NEXT: retl
entry:
%foo = alloca %struct.Foo, align 4
diff --git a/test/CodeGen/X86/pr34653.ll b/test/CodeGen/X86/pr34653.ll
new file mode 100644
index 00000000000..129dbcacc95
--- /dev/null
+++ b/test/CodeGen/X86/pr34653.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+avx512f -o - | FileCheck %s
+
+declare fastcc <38 x double> @test()
+
+define void @pr34653() {
+; CHECK-LABEL: pr34653:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-512, %rsp # imm = 0xFE00
+; CHECK-NEXT: subq $2048, %rsp # imm = 0x800
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: callq test
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT: vmovaps %xmm0, %xmm1
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm2
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vmovaps %xmm3, %xmm4
+; CHECK-NEXT: vmovaps %xmm2, %xmm5
+; CHECK-NEXT: vmovaps %xmm5, %xmm6
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm7
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm8
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm9
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm10
+; CHECK-NEXT: vextractf32x4 $3, %zmm10, %xmm11
+; CHECK-NEXT: vmovaps %xmm11, %xmm12
+; CHECK-NEXT: vextractf32x4 $2, %zmm10, %xmm13
+; CHECK-NEXT: vmovaps %xmm13, %xmm14
+; CHECK-NEXT: vmovaps %xmm10, %xmm15
+; CHECK-NEXT: vmovaps %xmm15, %xmm2
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $3, %zmm9, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $2, %zmm9, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm9, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $3, %zmm8, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $2, %zmm8, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm8, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $3, %zmm7, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $2, %zmm7, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm7, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm11 = xmm11[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm13[1,0]
+; CHECK-NEXT: # kill: %YMM10<def> %YMM10<kill> %ZMM10<kill>
+; CHECK-NEXT: vextractf128 $1, %ymm10, %xmm10
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm10, %xmm0
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm15 = xmm15[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: # kill: %YMM9<def> %YMM9<kill> %ZMM9<kill>
+; CHECK-NEXT: vextractf128 $1, %ymm9, %xmm9
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm9, %xmm0
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: # kill: %YMM8<def> %YMM8<kill> %ZMM8<kill>
+; CHECK-NEXT: vextractf128 $1, %ymm8, %xmm8
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm8, %xmm0
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: # kill: %YMM7<def> %YMM7<kill> %ZMM7<kill>
+; CHECK-NEXT: vextractf128 $1, %ymm7, %xmm7
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm7, %xmm0
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm8, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm13, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm1, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm14, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm2, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm4, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm9, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm10, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm15, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm11, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm3, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm6, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm5, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm12, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm7, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %v = call fastcc <38 x double> @test()
+ %v.0 = extractelement <38 x double> %v, i32 0
+ ret void
+}
+
diff --git a/test/CodeGen/X86/pr34657.ll b/test/CodeGen/X86/pr34657.ll
new file mode 100644
index 00000000000..a63bc2a08dd
--- /dev/null
+++ b/test/CodeGen/X86/pr34657.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw -o - | FileCheck %s
+
+define <112 x i8> @pr34657() local_unnamed_addr {
+; CHECK-LABEL: pr34657
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vmovups (%rax), %xmm0
+; CHECK-NEXT: vmovups (%rax), %ymm1
+; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; CHECK-NEXT: vmovups (%rax), %zmm2
+; CHECK-NEXT: vmovaps %ymm1, 64(%rdi)
+; CHECK-NEXT: vmovaps %zmm2, (%rdi)
+; CHECK-NEXT: vextractf32x4 $2, %zmm0, 96(%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %wide.vec51 = load <112 x i8>, <112 x i8>* undef, align 2
+ ret <112 x i8> %wide.vec51
+}
diff --git a/test/CodeGen/X86/pr9743.ll b/test/CodeGen/X86/pr9743.ll
index 73b3c7f835c..ac3d4575510 100644
--- a/test/CodeGen/X86/pr9743.ll
+++ b/test/CodeGen/X86/pr9743.ll
@@ -11,4 +11,5 @@ define void @f() {
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
; CHECK-NEXT: ret
diff --git a/test/CodeGen/X86/push-cfi-debug.ll b/test/CodeGen/X86/push-cfi-debug.ll
index 7f438e306e4..01fa12e87d0 100644
--- a/test/CodeGen/X86/push-cfi-debug.ll
+++ b/test/CodeGen/X86/push-cfi-debug.ll
@@ -23,8 +23,10 @@ declare x86_stdcallcc void @stdfoo(i32, i32) #0
; CHECK: .cfi_adjust_cfa_offset 4
; CHECK: calll stdfoo
; CHECK: .cfi_adjust_cfa_offset -8
-; CHECK: addl $20, %esp
+; CHECK: addl $8, %esp
; CHECK: .cfi_adjust_cfa_offset -8
+; CHECK: addl $12, %esp
+; CHECK: .cfi_def_cfa_offset 4
define void @test1() #0 !dbg !4 {
entry:
tail call void @foo(i32 1, i32 2) #1, !dbg !10
diff --git a/test/CodeGen/X86/push-cfi-obj.ll b/test/CodeGen/X86/push-cfi-obj.ll
index 33291ec3318..2c9ec334027 100644
--- a/test/CodeGen/X86/push-cfi-obj.ll
+++ b/test/CodeGen/X86/push-cfi-obj.ll
@@ -12,7 +12,7 @@
; LINUX-NEXT: ]
; LINUX-NEXT: Address: 0x0
; LINUX-NEXT: Offset: 0x68
-; LINUX-NEXT: Size: 64
+; LINUX-NEXT: Size: 72
; LINUX-NEXT: Link: 0
; LINUX-NEXT: Info: 0
; LINUX-NEXT: AddressAlignment: 4
@@ -22,8 +22,9 @@
; LINUX-NEXT: SectionData (
; LINUX-NEXT: 0000: 1C000000 00000000 017A504C 5200017C |.........zPLR..||
; LINUX-NEXT: 0010: 08070000 00000000 1B0C0404 88010000 |................|
-; LINUX-NEXT: 0020: 1C000000 24000000 00000000 1D000000 |....$...........|
+; LINUX-NEXT: 0020: 24000000 24000000 00000000 1D000000 |$...$...........|
; LINUX-NEXT: 0030: 04000000 00410E08 8502420D 05432E10 |.....A....B..C..|
+; LINUX-NEXT: 0040: 540C0404 410C0508 |T...A...|
; LINUX-NEXT: )
declare i32 @__gxx_personality_v0(...)
@@ -35,7 +36,7 @@ entry:
to label %continue unwind label %cleanup
continue:
ret void
-cleanup:
+cleanup:
landingpad { i8*, i32 }
cleanup
ret void
diff --git a/test/CodeGen/X86/push-cfi.ll b/test/CodeGen/X86/push-cfi.ll
index 91e579a8391..44f8bf857c4 100644
--- a/test/CodeGen/X86/push-cfi.ll
+++ b/test/CodeGen/X86/push-cfi.ll
@@ -74,8 +74,9 @@ cleanup:
; LINUX-NEXT: pushl $1
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: call
-; LINUX-NEXT: addl $28, %esp
+; LINUX-NEXT: addl $16, %esp
; LINUX: .cfi_adjust_cfa_offset -16
+; LINUX: addl $12, %esp
; DARWIN-NOT: .cfi_escape
; DARWIN-NOT: pushl
define void @test2_nofp() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
index 0e9d149373b..296d165b3eb 100644
--- a/test/CodeGen/X86/recip-fastmath.ll
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -144,14 +144,14 @@ define float @f32_one_step(float %x) #1 {
;
; KNL-LABEL: f32_one_step:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: f32_one_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -257,7 +257,7 @@ define float @f32_two_step(float %x) #2 {
;
; KNL-LABEL: f32_two_step:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
@@ -268,7 +268,7 @@ define float @f32_two_step(float %x) #2 {
;
; SKX-LABEL: f32_two_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
@@ -416,7 +416,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
;
; SKX-LABEL: v4f32_one_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -533,7 +533,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
;
; SKX-LABEL: v4f32_two_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
@@ -691,7 +691,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
;
; SKX-LABEL: v8f32_one_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -821,7 +821,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
;
; SKX-LABEL: v8f32_two_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
diff --git a/test/CodeGen/X86/recip-fastmath2.ll b/test/CodeGen/X86/recip-fastmath2.ll
index a263e9d3b65..f6eeeec57f1 100644
--- a/test/CodeGen/X86/recip-fastmath2.ll
+++ b/test/CodeGen/X86/recip-fastmath2.ll
@@ -56,13 +56,13 @@ define float @f32_no_step_2(float %x) #3 {
;
; KNL-LABEL: f32_no_step_2:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: f32_no_step_2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast float 1234.0, %x
@@ -144,7 +144,7 @@ define float @f32_one_step_2(float %x) #1 {
;
; KNL-LABEL: f32_one_step_2:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
@@ -152,7 +152,7 @@ define float @f32_one_step_2(float %x) #1 {
;
; SKX-LABEL: f32_one_step_2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
@@ -243,7 +243,7 @@ define float @f32_one_step_2_divs(float %x) #1 {
;
; KNL-LABEL: f32_one_step_2_divs:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
@@ -252,7 +252,7 @@ define float @f32_one_step_2_divs(float %x) #1 {
;
; SKX-LABEL: f32_one_step_2_divs:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
@@ -368,7 +368,7 @@ define float @f32_two_step_2(float %x) #2 {
;
; KNL-LABEL: f32_two_step_2:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
@@ -380,7 +380,7 @@ define float @f32_two_step_2(float %x) #2 {
;
; SKX-LABEL: f32_two_step_2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
@@ -478,7 +478,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
;
; SKX-LABEL: v4f32_one_step2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
@@ -580,7 +580,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
;
; SKX-LABEL: v4f32_one_step_2_divs:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
@@ -708,7 +708,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
;
; SKX-LABEL: v4f32_two_step2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
@@ -814,7 +814,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
;
; SKX-LABEL: v8f32_one_step2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
@@ -925,7 +925,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
;
; SKX-LABEL: v8f32_one_step_2_divs:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [11:0.50]
@@ -1067,7 +1067,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
;
; SKX-LABEL: v8f32_two_step2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
@@ -1124,7 +1124,7 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
;
; SKX-LABEL: v8f32_no_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
@@ -1183,7 +1183,7 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
;
; SKX-LABEL: v8f32_no_step2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
diff --git a/test/CodeGen/X86/return-ext.ll b/test/CodeGen/X86/return-ext.ll
index ef160f43b4a..c66e518943a 100644
--- a/test/CodeGen/X86/return-ext.ll
+++ b/test/CodeGen/X86/return-ext.ll
@@ -106,6 +106,7 @@ entry:
; CHECK: call
; CHECK-NEXT: movzbl
; CHECK-NEXT: {{pop|add}}
+; CHECK-NEXT: .cfi_def_cfa_offset {{4|8}}
; CHECK-NEXT: ret
}
@@ -120,6 +121,7 @@ entry:
; CHECK: call
; CHECK-NEXT: movzbl
; CHECK-NEXT: {{pop|add}}
+; CHECK-NEXT: .cfi_def_cfa_offset {{4|8}}
; CHECK-NEXT: ret
}
@@ -134,5 +136,6 @@ entry:
; CHECK: call
; CHECK-NEXT: movzwl
; CHECK-NEXT: {{pop|add}}
+; CHECK-NEXT: .cfi_def_cfa_offset {{4|8}}
; CHECK-NEXT: ret
}
diff --git a/test/CodeGen/X86/rtm.ll b/test/CodeGen/X86/rtm.ll
index bd2d3e544bd..a1feeb5999b 100644
--- a/test/CodeGen/X86/rtm.ll
+++ b/test/CodeGen/X86/rtm.ll
@@ -75,6 +75,7 @@ define void @f2(i32 %x) nounwind uwtable {
; X64-NEXT: xabort $1
; X64-NEXT: callq f1
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
%x.addr = alloca i32, align 4
diff --git a/test/CodeGen/X86/schedule-x86_32.ll b/test/CodeGen/X86/schedule-x86_32.ll
new file mode 100644
index 00000000000..5dc06e61cc6
--- /dev/null
+++ b/test/CodeGen/X86/schedule-x86_32.ll
@@ -0,0 +1,348 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=i686 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define i8 @test_aaa(i8 %a0) optsize {
+; GENERIC-LABEL: test_aaa:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: aaa
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_aaa:
+; ATOM: # BB#0:
+; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: aaa
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_aaa:
+; SLM: # BB#0:
+; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: aaa
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aaa:
+; SANDY: # BB#0:
+; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: aaa
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_aaa:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: aaa
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [5:0.50]
+;
+; BROADWELL-LABEL: test_aaa:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: aaa
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_aaa:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: aaa
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_aaa:
+; SKX: # BB#0:
+; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: aaa
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_aaa:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: aaa
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aaa:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: aaa
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i8 asm "aaa", "=r,r"(i8 %a0) nounwind
+ ret i8 %1
+}
+
+define i8 @test_aad(i16 %a0) optsize {
+; GENERIC-LABEL: test_aad:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: aad
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_aad:
+; ATOM: # BB#0:
+; ATOM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: aad
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_aad:
+; SLM: # BB#0:
+; SLM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: aad
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aad:
+; SANDY: # BB#0:
+; SANDY-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: aad
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_aad:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: aad
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [5:0.50]
+;
+; BROADWELL-LABEL: test_aad:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: aad
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_aad:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: aad
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_aad:
+; SKX: # BB#0:
+; SKX-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: aad
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_aad:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: aad
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aad:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: aad
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i8 asm "aad", "=r,r"(i16 %a0) nounwind
+ ret i8 %1
+}
+
+define i16 @test_aam(i8 %a0) optsize {
+; GENERIC-LABEL: test_aam:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: aam
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_aam:
+; ATOM: # BB#0:
+; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: aam
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_aam:
+; SLM: # BB#0:
+; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: aam
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aam:
+; SANDY: # BB#0:
+; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: aam
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_aam:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: aam
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [5:0.50]
+;
+; BROADWELL-LABEL: test_aam:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: aam
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_aam:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: aam
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_aam:
+; SKX: # BB#0:
+; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: aam
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_aam:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: aam
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aam:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: aam
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i16 asm "aam", "=r,r"(i8 %a0) nounwind
+ ret i16 %1
+}
+
+define i8 @test_aas(i8 %a0) optsize {
+; GENERIC-LABEL: test_aas:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: aas
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_aas:
+; ATOM: # BB#0:
+; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: aas
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_aas:
+; SLM: # BB#0:
+; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: aas
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aas:
+; SANDY: # BB#0:
+; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: aas
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_aas:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: aas
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [5:0.50]
+;
+; BROADWELL-LABEL: test_aas:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: aas
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_aas:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: aas
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_aas:
+; SKX: # BB#0:
+; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: aas
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_aas:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: aas
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aas:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: aas
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i8 asm "aas", "=r,r"(i8 %a0) nounwind
+ ret i8 %1
+}
diff --git a/test/CodeGen/X86/schedule-x86_64.ll b/test/CodeGen/X86/schedule-x86_64.ll
new file mode 100644
index 00000000000..1db8c8768bd
--- /dev/null
+++ b/test/CodeGen/X86/schedule-x86_64.ll
@@ -0,0 +1,737 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define i16 @test_bsf16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_bsf16:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsfw %di, %ax
+; GENERIC-NEXT: bsfw (%rsi), %cx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsf16:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsfw %di, %ax
+; ATOM-NEXT: bsfw (%rsi), %cx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsf16:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsfw %di, %ax
+; SLM-NEXT: bsfw (%rsi), %cx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsf16:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsfw %di, %ax
+; SANDY-NEXT: bsfw (%rsi), %cx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsf16:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsfw %di, %ax
+; HASWELL-NEXT: bsfw (%rsi), %cx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsf16:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsfw %di, %ax
+; BROADWELL-NEXT: bsfw (%rsi), %cx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsf16:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsfw %di, %ax
+; SKYLAKE-NEXT: bsfw (%rsi), %cx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsf16:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsfw %di, %ax
+; SKX-NEXT: bsfw (%rsi), %cx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsf16:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsfw %di, %ax
+; BTVER2-NEXT: bsfw (%rsi), %cx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsf16:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsfw %di, %ax
+; ZNVER1-NEXT: bsfw (%rsi), %cx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i16, i16 } asm sideeffect "bsf $2, $0 \0A\09 bsf $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i16 %a0, i16* %a1)
+ %2 = extractvalue { i16, i16 } %1, 0
+ %3 = extractvalue { i16, i16 } %1, 1
+ %4 = or i16 %2, %3
+ ret i16 %4
+}
+define i32 @test_bsf32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_bsf32:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsfl %edi, %eax
+; GENERIC-NEXT: bsfl (%rsi), %ecx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsf32:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsfl %edi, %eax
+; ATOM-NEXT: bsfl (%rsi), %ecx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsf32:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsfl %edi, %eax
+; SLM-NEXT: bsfl (%rsi), %ecx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsf32:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsfl %edi, %eax
+; SANDY-NEXT: bsfl (%rsi), %ecx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsf32:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsfl %edi, %eax
+; HASWELL-NEXT: bsfl (%rsi), %ecx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsf32:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsfl %edi, %eax
+; BROADWELL-NEXT: bsfl (%rsi), %ecx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsf32:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsfl %edi, %eax
+; SKYLAKE-NEXT: bsfl (%rsi), %ecx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsf32:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsfl %edi, %eax
+; SKX-NEXT: bsfl (%rsi), %ecx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsf32:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsfl %edi, %eax
+; BTVER2-NEXT: bsfl (%rsi), %ecx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsf32:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsfl %edi, %eax
+; ZNVER1-NEXT: bsfl (%rsi), %ecx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i32, i32 } asm sideeffect "bsf $2, $0 \0A\09 bsf $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i32 %a0, i32* %a1)
+ %2 = extractvalue { i32, i32 } %1, 0
+ %3 = extractvalue { i32, i32 } %1, 1
+ %4 = or i32 %2, %3
+ ret i32 %4
+}
+define i64 @test_bsf64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_bsf64:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsfq %rdi, %rax
+; GENERIC-NEXT: bsfq (%rsi), %rcx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsf64:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsfq %rdi, %rax
+; ATOM-NEXT: bsfq (%rsi), %rcx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsf64:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsfq %rdi, %rax
+; SLM-NEXT: bsfq (%rsi), %rcx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsf64:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsfq %rdi, %rax
+; SANDY-NEXT: bsfq (%rsi), %rcx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsf64:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsfq %rdi, %rax
+; HASWELL-NEXT: bsfq (%rsi), %rcx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsf64:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsfq %rdi, %rax
+; BROADWELL-NEXT: bsfq (%rsi), %rcx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsf64:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsfq %rdi, %rax
+; SKYLAKE-NEXT: bsfq (%rsi), %rcx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsf64:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsfq %rdi, %rax
+; SKX-NEXT: bsfq (%rsi), %rcx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsf64:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsfq %rdi, %rax
+; BTVER2-NEXT: bsfq (%rsi), %rcx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsf64:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsfq %rdi, %rax
+; ZNVER1-NEXT: bsfq (%rsi), %rcx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i64, i64 } asm sideeffect "bsf $2, $0 \0A\09 bsf $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i64 %a0, i64* %a1)
+ %2 = extractvalue { i64, i64 } %1, 0
+ %3 = extractvalue { i64, i64 } %1, 1
+ %4 = or i64 %2, %3
+ ret i64 %4
+}
+
+define i16 @test_bsr16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_bsr16:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsrw %di, %ax
+; GENERIC-NEXT: bsrw (%rsi), %cx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsr16:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsrw %di, %ax
+; ATOM-NEXT: bsrw (%rsi), %cx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsr16:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsrw %di, %ax
+; SLM-NEXT: bsrw (%rsi), %cx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsr16:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsrw %di, %ax
+; SANDY-NEXT: bsrw (%rsi), %cx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsr16:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsrw %di, %ax
+; HASWELL-NEXT: bsrw (%rsi), %cx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsr16:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsrw %di, %ax
+; BROADWELL-NEXT: bsrw (%rsi), %cx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsr16:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsrw %di, %ax
+; SKYLAKE-NEXT: bsrw (%rsi), %cx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsr16:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsrw %di, %ax
+; SKX-NEXT: bsrw (%rsi), %cx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsr16:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsrw %di, %ax
+; BTVER2-NEXT: bsrw (%rsi), %cx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsr16:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsrw %di, %ax
+; ZNVER1-NEXT: bsrw (%rsi), %cx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i16, i16 } asm sideeffect "bsr $2, $0 \0A\09 bsr $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i16 %a0, i16* %a1)
+ %2 = extractvalue { i16, i16 } %1, 0
+ %3 = extractvalue { i16, i16 } %1, 1
+ %4 = or i16 %2, %3
+ ret i16 %4
+}
+define i32 @test_bsr32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_bsr32:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsrl %edi, %eax
+; GENERIC-NEXT: bsrl (%rsi), %ecx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsr32:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsrl %edi, %eax
+; ATOM-NEXT: bsrl (%rsi), %ecx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsr32:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsrl %edi, %eax
+; SLM-NEXT: bsrl (%rsi), %ecx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsr32:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsrl %edi, %eax
+; SANDY-NEXT: bsrl (%rsi), %ecx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsr32:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsrl %edi, %eax
+; HASWELL-NEXT: bsrl (%rsi), %ecx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsr32:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsrl %edi, %eax
+; BROADWELL-NEXT: bsrl (%rsi), %ecx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsr32:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsrl %edi, %eax
+; SKYLAKE-NEXT: bsrl (%rsi), %ecx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsr32:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsrl %edi, %eax
+; SKX-NEXT: bsrl (%rsi), %ecx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsr32:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsrl %edi, %eax
+; BTVER2-NEXT: bsrl (%rsi), %ecx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsr32:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsrl %edi, %eax
+; ZNVER1-NEXT: bsrl (%rsi), %ecx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i32, i32 } asm sideeffect "bsr $2, $0 \0A\09 bsr $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i32 %a0, i32* %a1)
+ %2 = extractvalue { i32, i32 } %1, 0
+ %3 = extractvalue { i32, i32 } %1, 1
+ %4 = or i32 %2, %3
+ ret i32 %4
+}
+define i64 @test_bsr64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_bsr64:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsrq %rdi, %rax
+; GENERIC-NEXT: bsrq (%rsi), %rcx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsr64:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsrq %rdi, %rax
+; ATOM-NEXT: bsrq (%rsi), %rcx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsr64:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsrq %rdi, %rax
+; SLM-NEXT: bsrq (%rsi), %rcx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsr64:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsrq %rdi, %rax
+; SANDY-NEXT: bsrq (%rsi), %rcx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsr64:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsrq %rdi, %rax
+; HASWELL-NEXT: bsrq (%rsi), %rcx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsr64:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsrq %rdi, %rax
+; BROADWELL-NEXT: bsrq (%rsi), %rcx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsr64:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsrq %rdi, %rax
+; SKYLAKE-NEXT: bsrq (%rsi), %rcx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsr64:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsrq %rdi, %rax
+; SKX-NEXT: bsrq (%rsi), %rcx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsr64:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsrq %rdi, %rax
+; BTVER2-NEXT: bsrq (%rsi), %rcx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsr64:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsrq %rdi, %rax
+; ZNVER1-NEXT: bsrq (%rsi), %rcx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i64, i64 } asm sideeffect "bsr $2, $0 \0A\09 bsr $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i64 %a0, i64* %a1)
+ %2 = extractvalue { i64, i64 } %1, 0
+ %3 = extractvalue { i64, i64 } %1, 1
+ %4 = or i64 %2, %3
+ ret i64 %4
+}
+
+define i32 @test_bswap32(i32 %a0) optsize {
+; GENERIC-LABEL: test_bswap32:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: bswapl %edi # sched: [2:1.00]
+; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bswap32:
+; ATOM: # BB#0:
+; ATOM-NEXT: bswapl %edi # sched: [1:1.00]
+; ATOM-NEXT: movl %edi, %eax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bswap32:
+; SLM: # BB#0:
+; SLM-NEXT: bswapl %edi # sched: [1:0.50]
+; SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bswap32:
+; SANDY: # BB#0:
+; SANDY-NEXT: bswapl %edi # sched: [2:1.00]
+; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bswap32:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: bswapl %edi # sched: [2:0.50]
+; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bswap32:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: bswapl %edi # sched: [2:0.50]
+; BROADWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bswap32:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: bswapl %edi # sched: [2:0.50]
+; SKYLAKE-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bswap32:
+; SKX: # BB#0:
+; SKX-NEXT: bswapl %edi # sched: [2:0.50]
+; SKX-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bswap32:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: bswapl %edi # sched: [1:0.50]
+; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bswap32:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: bswapl %edi # sched: [1:1.00]
+; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = tail call i32 asm "bswap $0", "=r,0"(i32 %a0) nounwind
+ ret i32 %1
+}
+define i64 @test_bswap64(i64 %a0) optsize {
+; GENERIC-LABEL: test_bswap64:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: bswapq %rdi # sched: [2:1.00]
+; GENERIC-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bswap64:
+; ATOM: # BB#0:
+; ATOM-NEXT: bswapq %rdi # sched: [1:1.00]
+; ATOM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bswap64:
+; SLM: # BB#0:
+; SLM-NEXT: bswapq %rdi # sched: [1:0.50]
+; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bswap64:
+; SANDY: # BB#0:
+; SANDY-NEXT: bswapq %rdi # sched: [2:1.00]
+; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bswap64:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: bswapq %rdi # sched: [2:0.50]
+; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bswap64:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: bswapq %rdi # sched: [2:0.50]
+; BROADWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bswap64:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: bswapq %rdi # sched: [2:0.50]
+; SKYLAKE-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bswap64:
+; SKX: # BB#0:
+; SKX-NEXT: bswapq %rdi # sched: [2:0.50]
+; SKX-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bswap64:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: bswapq %rdi # sched: [1:0.50]
+; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bswap64:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: bswapq %rdi # sched: [1:1.00]
+; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = tail call i64 asm "bswap $0", "=r,0"(i64 %a0) nounwind
+ ret i64 %1
+}
diff --git a/test/CodeGen/X86/select-mmx.ll b/test/CodeGen/X86/select-mmx.ll
index 795990e3c32..7ad8b6f1b9c 100644
--- a/test/CodeGen/X86/select-mmx.ll
+++ b/test/CodeGen/X86/select-mmx.ll
@@ -48,6 +48,7 @@ define i64 @test47(i64 %arg) {
; I32-NEXT: movl {{[0-9]+}}(%esp), %edx
; I32-NEXT: movl %ebp, %esp
; I32-NEXT: popl %ebp
+; I32-NEXT: .cfi_def_cfa %esp, 4
; I32-NEXT: retl
%cond = icmp eq i64 %arg, 0
%slct = select i1 %cond, x86_mmx bitcast (i64 7 to x86_mmx), x86_mmx bitcast (i64 0 to x86_mmx)
@@ -100,6 +101,7 @@ define i64 @test49(i64 %arg, i64 %x, i64 %y) {
; I32-NEXT: movl {{[0-9]+}}(%esp), %edx
; I32-NEXT: movl %ebp, %esp
; I32-NEXT: popl %ebp
+; I32-NEXT: .cfi_def_cfa %esp, 4
; I32-NEXT: retl
%cond = icmp eq i64 %arg, 0
%xmmx = bitcast i64 %x to x86_mmx
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index 52225397ef0..c3674639eab 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -15,7 +15,6 @@ define i32 @test1(%0* %p, %0* %q, i1 %r) nounwind {
; CHECK-NEXT: cmovneq %rdi, %rsi
; CHECK-NEXT: movl (%rsi), %eax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test1:
; MCU: # BB#0:
@@ -45,7 +44,7 @@ define i32 @test2() nounwind {
; GENERIC-NEXT: callq _return_false
; GENERIC-NEXT: xorl %ecx, %ecx
; GENERIC-NEXT: testb $1, %al
-; GENERIC-NEXT: movl $-480, %eax
+; GENERIC-NEXT: movl $-480, %eax ## imm = 0xFE20
; GENERIC-NEXT: cmovnel %ecx, %eax
; GENERIC-NEXT: shll $3, %eax
; GENERIC-NEXT: cmpl $32768, %eax ## imm = 0x8000
@@ -55,14 +54,13 @@ define i32 @test2() nounwind {
; GENERIC-NEXT: popq %rcx
; GENERIC-NEXT: retq
; GENERIC-NEXT: LBB1_1: ## %bb90
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test2:
; ATOM: ## BB#0: ## %entry
; ATOM-NEXT: pushq %rax
; ATOM-NEXT: callq _return_false
; ATOM-NEXT: xorl %ecx, %ecx
-; ATOM-NEXT: movl $-480, %edx
+; ATOM-NEXT: movl $-480, %edx ## imm = 0xFE20
; ATOM-NEXT: testb $1, %al
; ATOM-NEXT: cmovnel %ecx, %edx
; ATOM-NEXT: shll $3, %edx
@@ -73,17 +71,16 @@ define i32 @test2() nounwind {
; ATOM-NEXT: popq %rcx
; ATOM-NEXT: retq
; ATOM-NEXT: LBB1_1: ## %bb90
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test2:
; MCU: # BB#0: # %entry
; MCU-NEXT: calll return_false
-; MCU-NEXT: xorl %ecx, %ecx
+; MCU-NEXT: xorl %ecx, %ecx
; MCU-NEXT: testb $1, %al
; MCU-NEXT: jne .LBB1_2
; MCU-NEXT: # BB#1: # %entry
; MCU-NEXT: movl $-480, %ecx # imm = 0xFE20
-; MCU-NEXT: .LBB1_2:
+; MCU-NEXT: .LBB1_2: # %entry
; MCU-NEXT: shll $3, %ecx
; MCU-NEXT: cmpl $32768, %ecx # imm = 0x8000
; MCU-NEXT: jge .LBB1_3
@@ -116,7 +113,6 @@ define float @test3(i32 %x) nounwind readnone {
; CHECK-NEXT: leaq {{.*}}(%rip), %rcx
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test3:
; MCU: # BB#0: # %entry
@@ -140,7 +136,6 @@ define signext i8 @test4(i8* nocapture %P, double %F) nounwind readonly {
; CHECK-NEXT: seta %al
; CHECK-NEXT: movsbl (%rdi,%rax,4), %eax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test4:
; MCU: # BB#0: # %entry
@@ -175,7 +170,6 @@ define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind {
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; CHECK-NEXT: movd %xmm0, (%rsi)
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test5:
; MCU: # BB#0:
@@ -211,7 +205,6 @@ define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-NEXT: mulps %xmm0, %xmm0
; CHECK-NEXT: movaps %xmm0, (%rsi)
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test6:
; MCU: # BB#0:
@@ -283,7 +276,6 @@ define x86_fp80 @test7(i32 %tmp8) nounwind {
; CHECK-NEXT: leaq {{.*}}(%rip), %rcx
; CHECK-NEXT: fldt (%rax,%rcx)
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test7:
; MCU: # BB#0:
@@ -333,7 +325,6 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; GENERIC-NEXT: movq %xmm1, 16(%rsi)
; GENERIC-NEXT: movdqa %xmm0, (%rsi)
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test8:
; ATOM: ## BB#0:
@@ -366,7 +357,6 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; ATOM-NEXT: movdqa %xmm0, (%rsi)
; ATOM-NEXT: movq %xmm1, 16(%rsi)
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test8:
; MCU: # BB#0:
@@ -456,7 +446,6 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: orq %rsi, %rax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test9:
; ATOM: ## BB#0:
@@ -466,7 +455,6 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test9:
; MCU: # BB#0:
@@ -493,7 +481,6 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: orq %rsi, %rax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test9a:
; ATOM: ## BB#0:
@@ -503,7 +490,6 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test9a:
; MCU: # BB#0:
@@ -528,7 +514,6 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: orq %rsi, %rax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test9b:
; ATOM: ## BB#0:
@@ -538,7 +523,6 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test9b:
; MCU: # BB#0:
@@ -566,7 +550,6 @@ define i64 @test10(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK-NEXT: setne %al
; CHECK-NEXT: leaq -1(%rax,%rax), %rax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test10:
; MCU: # BB#0:
@@ -592,7 +575,6 @@ define i64 @test11(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK-NEXT: notq %rax
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test11:
; MCU: # BB#0:
@@ -619,7 +601,6 @@ define i64 @test11a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK-NEXT: notq %rax
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test11a:
; MCU: # BB#0:
@@ -649,7 +630,6 @@ define noalias i8* @test12(i64 %count) nounwind ssp noredzone {
; GENERIC-NEXT: movq $-1, %rdi
; GENERIC-NEXT: cmovnoq %rax, %rdi
; GENERIC-NEXT: jmp __Znam ## TAILCALL
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test12:
; ATOM: ## BB#0: ## %entry
@@ -659,7 +639,6 @@ define noalias i8* @test12(i64 %count) nounwind ssp noredzone {
; ATOM-NEXT: movq $-1, %rdi
; ATOM-NEXT: cmovnoq %rax, %rdi
; ATOM-NEXT: jmp __Znam ## TAILCALL
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test12:
; MCU: # BB#0: # %entry
@@ -710,7 +689,6 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
; GENERIC-NEXT: cmpl %esi, %edi
; GENERIC-NEXT: sbbl %eax, %eax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test13:
; ATOM: ## BB#0:
@@ -721,7 +699,6 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test13:
; MCU: # BB#0:
@@ -741,7 +718,6 @@ define i32 @test14(i32 %a, i32 %b) nounwind {
; CHECK-NEXT: setae %al
; CHECK-NEXT: negl %eax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test14:
; MCU: # BB#0:
@@ -763,7 +739,6 @@ define i32 @test15(i32 %x) nounwind {
; GENERIC-NEXT: negl %edi
; GENERIC-NEXT: sbbl %eax, %eax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test15:
; ATOM: ## BB#0: ## %entry
@@ -774,7 +749,6 @@ define i32 @test15(i32 %x) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test15:
; MCU: # BB#0: # %entry
@@ -826,7 +800,6 @@ define i16 @test17(i16 %x) nounwind {
; GENERIC-NEXT: sbbl %eax, %eax
; GENERIC-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test17:
; ATOM: ## BB#0: ## %entry
@@ -838,7 +811,6 @@ define i16 @test17(i16 %x) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test17:
; MCU: # BB#0: # %entry
@@ -859,7 +831,6 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
; GENERIC-NEXT: cmovgel %edx, %esi
; GENERIC-NEXT: movl %esi, %eax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test18:
; ATOM: ## BB#0:
@@ -869,7 +840,6 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test18:
; MCU: # BB#0:
diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll
index 20c77a4a517..5ae2cc5f35c 100644
--- a/test/CodeGen/X86/setcc-lowering.ll
+++ b/test/CodeGen/X86/setcc-lowering.ll
@@ -23,10 +23,9 @@ define <8 x i16> @pr25080(<8 x i32> %a) {
;
; KNL-32-LABEL: pr25080:
; KNL-32: # BB#0: # %entry
-; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607]
-; KNL-32-NEXT: vpand %ymm1, %ymm0, %ymm0
-; KNL-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-32-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; KNL-32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-32-NEXT: vbroadcastss {{.*#+}} ymm1 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607]
+; KNL-32-NEXT: vptestnmd %zmm1, %zmm0, %k0
; KNL-32-NEXT: movb $15, %al
; KNL-32-NEXT: kmovw %eax, %k1
; KNL-32-NEXT: korw %k1, %k0, %k1
@@ -90,6 +89,7 @@ define void @pr26232(i64 %a, <16 x i1> %b) {
; KNL-32-NEXT: jne .LBB1_1
; KNL-32-NEXT: # BB#2: # %for_exit600
; KNL-32-NEXT: popl %esi
+; KNL-32-NEXT: .cfi_def_cfa_offset 4
; KNL-32-NEXT: retl
allocas:
br label %for_test11.preheader
diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll
index 79cf0f2c8f1..a2767205fe2 100644
--- a/test/CodeGen/X86/shrink_vmul.ll
+++ b/test/CodeGen/X86/shrink_vmul.ll
@@ -31,6 +31,7 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi8:
@@ -89,6 +90,7 @@ define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_4xi8:
@@ -148,6 +150,7 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_8xi8:
@@ -220,6 +223,7 @@ define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: movdqu %xmm4, 16(%esi,%ecx,4)
; X86-NEXT: movdqu %xmm3, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_16xi8:
@@ -288,6 +292,7 @@ define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi16:
@@ -342,6 +347,7 @@ define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_4xi16:
@@ -399,6 +405,7 @@ define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_8xi16:
@@ -469,6 +476,7 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6
; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_16xi16:
@@ -541,6 +549,7 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b,
; X86-NEXT: psrad $16, %xmm0
; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi8_sext:
@@ -606,6 +615,7 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi8_sext_zext:
@@ -666,6 +676,7 @@ define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi16_sext:
@@ -733,6 +744,7 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi16_sext_zext:
@@ -813,6 +825,7 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %
; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_16xi16_sext:
diff --git a/test/CodeGen/X86/sse-intrinsics-x86.ll b/test/CodeGen/X86/sse-intrinsics-x86.ll
index f178e18a259..ca74ee5732d 100644
--- a/test/CodeGen/X86/sse-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse-intrinsics-x86.ll
@@ -401,15 +401,10 @@ define <4 x float> @test_x86_sse_rcp_ps(<4 x float> %a0) {
; SSE-NEXT: rcpps %xmm0, %xmm0 ## encoding: [0x0f,0x53,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; AVX2-LABEL: test_x86_sse_rcp_ps:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_rcp_ps:
-; SKX: ## BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4c,0xc0]
-; SKX-NEXT: retl ## encoding: [0xc3]
+; VCHECK-LABEL: test_x86_sse_rcp_ps:
+; VCHECK: ## BB#0:
+; VCHECK-NEXT: vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0]
+; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -438,15 +433,10 @@ define <4 x float> @test_x86_sse_rsqrt_ps(<4 x float> %a0) {
; SSE-NEXT: rsqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x52,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; AVX2-LABEL: test_x86_sse_rsqrt_ps:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_rsqrt_ps:
-; SKX: ## BB#0:
-; SKX-NEXT: vrsqrt14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4e,0xc0]
-; SKX-NEXT: retl ## encoding: [0xc3]
+; VCHECK-LABEL: test_x86_sse_rsqrt_ps:
+; VCHECK: ## BB#0:
+; VCHECK-NEXT: vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0]
+; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -475,10 +465,15 @@ define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
; SSE-NEXT: sqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse_sqrt_ps:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse_sqrt_ps:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse_sqrt_ps:
+; SKX: ## BB#0:
+; SKX-NEXT: vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -491,10 +486,15 @@ define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) {
; SSE-NEXT: sqrtss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse_sqrt_ss:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse_sqrt_ss:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse_sqrt_ss:
+; SKX: ## BB#0:
+; SKX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll
index b5c2bff4b8f..d3c995197e8 100644
--- a/test/CodeGen/X86/sse-schedule.ll
+++ b/test/CodeGen/X86/sse-schedule.ll
@@ -2547,8 +2547,8 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) {
;
; SKX-LABEL: test_rcpps:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm0 # sched: [4:1.00]
-; SKX-NEXT: vrcp14ps (%rdi), %xmm1 # sched: [10:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpps (%rdi), %xmm1 # sched: [10:1.00]
; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
@@ -2719,8 +2719,8 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) {
;
; SKX-LABEL: test_rsqrtps:
; SKX: # BB#0:
-; SKX-NEXT: vrsqrt14ps %xmm0, %xmm0 # sched: [4:1.00]
-; SKX-NEXT: vrsqrt14ps (%rdi), %xmm1 # sched: [10:1.00]
+; SKX-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [10:1.00]
; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index d4047faad9b..72c68c56638 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -1592,10 +1592,15 @@ define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
; SSE-NEXT: sqrtpd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse2_sqrt_pd:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse2_sqrt_pd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_sqrt_pd:
+; SKX: ## BB#0:
+; SKX-NEXT: vsqrtpd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x51,0xc0]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1608,10 +1613,15 @@ define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
; SSE-NEXT: sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse2_sqrt_sd:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse2_sqrt_sd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_sqrt_sd:
+; SKX: ## BB#0:
+; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1637,7 +1647,7 @@ define <2 x double> @test_x86_sse2_sqrt_sd_vec_load(<2 x double>* %a0) {
; SKX: ## BB#0:
; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SKX-NEXT: vmovapd (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x00]
-; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
+; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%a1 = load <2 x double>, <2 x double>* %a0, align 16
%res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a1) ; <<2 x double>> [#uses=1]
diff --git a/test/CodeGen/X86/statepoint-call-lowering.ll b/test/CodeGen/X86/statepoint-call-lowering.ll
index bd2dd53b654..d80c87b99b6 100644
--- a/test/CodeGen/X86/statepoint-call-lowering.ll
+++ b/test/CodeGen/X86/statepoint-call-lowering.ll
@@ -83,6 +83,7 @@ define i1 @test_relocate(i32 addrspace(1)* %a) gc "statepoint-example" {
; CHECK: callq return_i1
; CHECK-NEXT: .Ltmp5:
; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
entry:
%safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a)
diff --git a/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll b/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
index b88ca03805f..90f2002e2d4 100644
--- a/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
+++ b/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
@@ -69,6 +69,7 @@ define i1 @test_relocate(i32 addrspace(1)* %a) gc "statepoint-example" {
; CHECK: callq return_i1
; CHECK-NEXT: .Ltmp4:
; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
entry:
%safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 1, i32 0, i32 0, i32 addrspace(1)* %a)
diff --git a/test/CodeGen/X86/statepoint-invoke.ll b/test/CodeGen/X86/statepoint-invoke.ll
index 784b932addc..5aa902546c1 100644
--- a/test/CodeGen/X86/statepoint-invoke.ll
+++ b/test/CodeGen/X86/statepoint-invoke.ll
@@ -142,6 +142,7 @@ normal_return:
; CHECK-LABEL: %normal_return
; CHECK: xorl %eax, %eax
; CHECK-NEXT: popq
+ ; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%null.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 13, i32 13)
%undef.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 14, i32 14)
@@ -169,6 +170,7 @@ entry:
normal_return:
; CHECK: leaq
; CHECK-NEXT: popq
+ ; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%aa.rel = call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %sp, i32 13, i32 13)
%aa.converted = bitcast i32 addrspace(1)* %aa.rel to i64 addrspace(1)*
@@ -177,6 +179,7 @@ normal_return:
exceptional_return:
; CHECK: movl $15
; CHECK-NEXT: popq
+ ; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%landing_pad = landingpad token
cleanup
diff --git a/test/CodeGen/X86/throws-cfi-fp.ll b/test/CodeGen/X86/throws-cfi-fp.ll
new file mode 100644
index 00000000000..bacd965054c
--- /dev/null
+++ b/test/CodeGen/X86/throws-cfi-fp.ll
@@ -0,0 +1,98 @@
+; RUN: llc %s -o - | FileCheck %s
+
+; ModuleID = 'throws-cfi-fp.cpp'
+source_filename = "throws-cfi-fp.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__clang_call_terminate = comdat any
+
+@_ZL11ShouldThrow = internal unnamed_addr global i1 false, align 1
+@_ZTIi = external constant i8*
+@str = private unnamed_addr constant [20 x i8] c"Threw an exception!\00"
+
+; Function Attrs: uwtable
+define void @_Z6throwsv() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+
+; CHECK-LABEL: _Z6throwsv:
+; CHECK: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_1:
+; CHECK-NEXT: .cfi_def_cfa %rbp, 16
+
+entry:
+ %.b5 = load i1, i1* @_ZL11ShouldThrow, align 1
+ br i1 %.b5, label %if.then, label %try.cont
+
+if.then: ; preds = %entry
+ %exception = tail call i8* @__cxa_allocate_exception(i64 4)
+ %0 = bitcast i8* %exception to i32*
+ store i32 1, i32* %0, align 16
+ invoke void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+ to label %unreachable unwind label %lpad
+
+lpad: ; preds = %if.then
+ %1 = landingpad { i8*, i32 }
+ catch i8* null
+ %2 = extractvalue { i8*, i32 } %1, 0
+ %3 = tail call i8* @__cxa_begin_catch(i8* %2)
+ %puts = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @str, i64 0, i64 0))
+ invoke void @__cxa_rethrow()
+ to label %unreachable unwind label %lpad1
+
+lpad1: ; preds = %lpad
+ %4 = landingpad { i8*, i32 }
+ cleanup
+ invoke void @__cxa_end_catch()
+ to label %eh.resume unwind label %terminate.lpad
+
+try.cont: ; preds = %entry
+ ret void
+
+eh.resume: ; preds = %lpad1
+ resume { i8*, i32 } %4
+
+terminate.lpad: ; preds = %lpad1
+ %5 = landingpad { i8*, i32 }
+ catch i8* null
+ %6 = extractvalue { i8*, i32 } %5, 0
+ tail call void @__clang_call_terminate(i8* %6)
+ unreachable
+
+unreachable: ; preds = %lpad, %if.then
+ unreachable
+}
+
+declare i8* @__cxa_allocate_exception(i64)
+
+declare void @__cxa_throw(i8*, i8*, i8*)
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_rethrow()
+
+declare void @__cxa_end_catch()
+
+; Function Attrs: noinline noreturn nounwind
+declare void @__clang_call_terminate(i8*)
+
+declare void @_ZSt9terminatev()
+
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture readonly)
+
+attributes #0 = { "no-frame-pointer-elim"="true" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!8, !9, !10}
+
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 6.0.0 (https://github.com/llvm-mirror/clang.git 316ebefb7fff8ad324a08a694347500b6cd7c95f) (https://github.com/llvm-mirror/llvm.git dcae9be81fc17cdfbe989402354d3c8ecd0a2c79)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "throws-cfi-fp.cpp", directory: "epilogue-dwarf/test")
+!4 = !{}
+!5 = !{}
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = !{i32 1, !"wchar_size", i32 4}
diff --git a/test/CodeGen/X86/throws-cfi-no-fp.ll b/test/CodeGen/X86/throws-cfi-no-fp.ll
new file mode 100644
index 00000000000..1483e6b8483
--- /dev/null
+++ b/test/CodeGen/X86/throws-cfi-no-fp.ll
@@ -0,0 +1,97 @@
+; RUN: llc %s -o - | FileCheck %s
+
+; ModuleID = 'throws-cfi-no-fp.cpp'
+source_filename = "throws-cfi-no-fp.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__clang_call_terminate = comdat any
+
+@_ZL11ShouldThrow = internal unnamed_addr global i1 false, align 1
+@_ZTIi = external constant i8*
+@str = private unnamed_addr constant [20 x i8] c"Threw an exception!\00"
+
+; Function Attrs: uwtable
+define void @_Z6throwsv() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+
+; CHECK-LABEL: _Z6throwsv:
+; CHECK: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_1:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+
+entry:
+ %.b5 = load i1, i1* @_ZL11ShouldThrow, align 1
+ br i1 %.b5, label %if.then, label %try.cont
+
+if.then: ; preds = %entry
+ %exception = tail call i8* @__cxa_allocate_exception(i64 4)
+ %0 = bitcast i8* %exception to i32*
+ store i32 1, i32* %0, align 16
+ invoke void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+ to label %unreachable unwind label %lpad
+
+lpad: ; preds = %if.then
+ %1 = landingpad { i8*, i32 }
+ catch i8* null
+ %2 = extractvalue { i8*, i32 } %1, 0
+ %3 = tail call i8* @__cxa_begin_catch(i8* %2)
+ %puts = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @str, i64 0, i64 0))
+ invoke void @__cxa_rethrow() #4
+ to label %unreachable unwind label %lpad1
+
+lpad1: ; preds = %lpad
+ %4 = landingpad { i8*, i32 }
+ cleanup
+ invoke void @__cxa_end_catch()
+ to label %eh.resume unwind label %terminate.lpad
+
+try.cont: ; preds = %entry
+ ret void
+
+eh.resume: ; preds = %lpad1
+ resume { i8*, i32 } %4
+
+terminate.lpad: ; preds = %lpad1
+ %5 = landingpad { i8*, i32 }
+ catch i8* null
+ %6 = extractvalue { i8*, i32 } %5, 0
+ tail call void @__clang_call_terminate(i8* %6)
+ unreachable
+
+unreachable: ; preds = %lpad, %if.then
+ unreachable
+}
+
+declare i8* @__cxa_allocate_exception(i64)
+
+declare void @__cxa_throw(i8*, i8*, i8*)
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_rethrow()
+
+declare void @__cxa_end_catch()
+
+; Function Attrs: noinline noreturn nounwind
+declare void @__clang_call_terminate(i8*)
+
+declare void @_ZSt9terminatev()
+
+
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture readonly)
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!8, !9, !10}
+
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 6.0.0 (https://github.com/llvm-mirror/clang.git 316ebefb7fff8ad324a08a694347500b6cd7c95f) (https://github.com/llvm-mirror/llvm.git dcae9be81fc17cdfbe989402354d3c8ecd0a2c79)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "throws-cfi-no-fp.cpp", directory: "epilogue-dwarf/test")
+!4 = !{}
+!5 = !{}
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = !{i32 1, !"wchar_size", i32 4}
diff --git a/test/CodeGen/X86/var-permute-128.ll b/test/CodeGen/X86/var-permute-128.ll
index f74343d7f2a..208fab88b58 100644
--- a/test/CodeGen/X86/var-permute-128.ll
+++ b/test/CodeGen/X86/var-permute-128.ll
@@ -143,35 +143,40 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: retq
;
-; AVX-LABEL: var_shuffle_v8i16:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd %xmm1, %eax
-; AVX-NEXT: vpextrw $1, %xmm1, %r10d
-; AVX-NEXT: vpextrw $2, %xmm1, %ecx
-; AVX-NEXT: vpextrw $3, %xmm1, %edx
-; AVX-NEXT: vpextrw $4, %xmm1, %esi
-; AVX-NEXT: vpextrw $5, %xmm1, %edi
-; AVX-NEXT: vpextrw $6, %xmm1, %r8d
-; AVX-NEXT: vpextrw $7, %xmm1, %r9d
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: andl $7, %eax
-; AVX-NEXT: andl $7, %r10d
-; AVX-NEXT: andl $7, %ecx
-; AVX-NEXT: andl $7, %edx
-; AVX-NEXT: andl $7, %esi
-; AVX-NEXT: andl $7, %edi
-; AVX-NEXT: andl $7, %r8d
-; AVX-NEXT: andl $7, %r9d
-; AVX-NEXT: movzwl -24(%rsp,%rax,2), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpinsrw $1, -24(%rsp,%r10,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $2, -24(%rsp,%rcx,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $3, -24(%rsp,%rdx,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $4, -24(%rsp,%rsi,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $5, -24(%rsp,%rdi,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $6, -24(%rsp,%r8,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $7, -24(%rsp,%r9,2), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVXNOVLBW-LABEL: var_shuffle_v8i16:
+; AVXNOVLBW: # BB#0:
+; AVXNOVLBW-NEXT: vmovd %xmm1, %eax
+; AVXNOVLBW-NEXT: vpextrw $1, %xmm1, %r10d
+; AVXNOVLBW-NEXT: vpextrw $2, %xmm1, %ecx
+; AVXNOVLBW-NEXT: vpextrw $3, %xmm1, %edx
+; AVXNOVLBW-NEXT: vpextrw $4, %xmm1, %esi
+; AVXNOVLBW-NEXT: vpextrw $5, %xmm1, %edi
+; AVXNOVLBW-NEXT: vpextrw $6, %xmm1, %r8d
+; AVXNOVLBW-NEXT: vpextrw $7, %xmm1, %r9d
+; AVXNOVLBW-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVXNOVLBW-NEXT: andl $7, %eax
+; AVXNOVLBW-NEXT: andl $7, %r10d
+; AVXNOVLBW-NEXT: andl $7, %ecx
+; AVXNOVLBW-NEXT: andl $7, %edx
+; AVXNOVLBW-NEXT: andl $7, %esi
+; AVXNOVLBW-NEXT: andl $7, %edi
+; AVXNOVLBW-NEXT: andl $7, %r8d
+; AVXNOVLBW-NEXT: andl $7, %r9d
+; AVXNOVLBW-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVXNOVLBW-NEXT: vmovd %eax, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $1, -24(%rsp,%r10,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $2, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $3, -24(%rsp,%rdx,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $4, -24(%rsp,%rsi,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $5, -24(%rsp,%rdi,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $6, -24(%rsp,%r8,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $7, -24(%rsp,%r9,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: retq
+;
+; AVX512VLBW-LABEL: var_shuffle_v8i16:
+; AVX512VLBW: # BB#0:
+; AVX512VLBW-NEXT: vpermw %xmm0, %xmm1, %xmm0
+; AVX512VLBW-NEXT: retq
%index0 = extractelement <8 x i16> %indices, i32 0
%index1 = extractelement <8 x i16> %indices, i32 1
%index2 = extractelement <8 x i16> %indices, i32 2
@@ -202,143 +207,13 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSSE3-LABEL: var_shuffle_v16i8:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm8
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm15
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm9
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm10
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm7
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm11
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm6
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm12
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm5
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm13
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm4
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm14
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: var_shuffle_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpextrb $0, %xmm1, %eax
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movzbl (%rax,%rcx), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpextrb $1, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $1, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $2, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $2, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $3, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $3, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $4, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $4, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $5, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $5, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $6, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $6, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $7, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $7, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $8, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $8, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $9, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $9, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $10, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $10, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $11, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $11, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $12, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $12, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $13, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $13, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $14, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $14, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $15, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $15, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%index0 = extractelement <16 x i8> %indices, i32 0
%index1 = extractelement <16 x i8> %indices, i32 1
diff --git a/test/CodeGen/X86/var-permute-256.ll b/test/CodeGen/X86/var-permute-256.ll
index dff145314ea..beef4643c13 100644
--- a/test/CodeGen/X86/var-permute-256.ll
+++ b/test/CodeGen/X86/var-permute-256.ll
@@ -34,32 +34,69 @@ define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
-; INT256-LABEL: var_shuffle_v4i64:
-; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vmovq %xmm1, %rax
-; INT256-NEXT: andl $3, %eax
-; INT256-NEXT: vpextrq $1, %xmm1, %rcx
-; INT256-NEXT: andl $3, %ecx
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm1
-; INT256-NEXT: vmovq %xmm1, %rdx
-; INT256-NEXT: andl $3, %edx
-; INT256-NEXT: vpextrq $1, %xmm1, %rsi
-; INT256-NEXT: andl $3, %esi
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; INT256-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; INT256-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; INT256-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; INT256-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; INT256-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; INT256-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
-; INT256-NEXT: retq
+; AVX2-LABEL: var_shuffle_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: andl $3, %eax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: andl $3, %ecx
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vmovq %xmm1, %rdx
+; AVX2-NEXT: andl $3, %edx
+; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT: andl $3, %esi
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: var_shuffle_v4i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: andl $3, %eax
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512F-NEXT: andl $3, %ecx
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vmovq %xmm1, %rdx
+; AVX512F-NEXT: andl $3, %edx
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512F-NEXT: andl $3, %esi
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBW-LABEL: var_shuffle_v4i64:
+; AVX512VLBW: # BB#0:
+; AVX512VLBW-NEXT: vpermpd %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
%index0 = extractelement <4 x i64> %indices, i32 0
%index1 = extractelement <4 x i64> %indices, i32 1
%index2 = extractelement <4 x i64> %indices, i32 2
@@ -120,44 +157,7 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
;
; INT256-LABEL: var_shuffle_v8i32:
; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vpextrq $1, %xmm1, %r8
-; INT256-NEXT: movq %r8, %rcx
-; INT256-NEXT: shrq $30, %rcx
-; INT256-NEXT: vmovq %xmm1, %r9
-; INT256-NEXT: movq %r9, %rsi
-; INT256-NEXT: shrq $30, %rsi
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm1
-; INT256-NEXT: vpextrq $1, %xmm1, %r10
-; INT256-NEXT: movq %r10, %rdi
-; INT256-NEXT: shrq $30, %rdi
-; INT256-NEXT: vmovq %xmm1, %rax
-; INT256-NEXT: movq %rax, %rdx
-; INT256-NEXT: shrq $30, %rdx
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: andl $7, %r9d
-; INT256-NEXT: andl $28, %esi
-; INT256-NEXT: andl $7, %r8d
-; INT256-NEXT: andl $28, %ecx
-; INT256-NEXT: andl $7, %eax
-; INT256-NEXT: andl $28, %edx
-; INT256-NEXT: andl $7, %r10d
-; INT256-NEXT: andl $28, %edi
-; INT256-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; INT256-NEXT: movq %rsp, %rax
-; INT256-NEXT: vpinsrd $1, (%rdx,%rax), %xmm0, %xmm0
-; INT256-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm0, %xmm0
-; INT256-NEXT: vpinsrd $3, (%rdi,%rax), %xmm0, %xmm0
-; INT256-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; INT256-NEXT: vpinsrd $1, (%rsi,%rax), %xmm1, %xmm1
-; INT256-NEXT: vpinsrd $2, (%rsp,%r8,4), %xmm1, %xmm1
-; INT256-NEXT: vpinsrd $3, (%rcx,%rax), %xmm1, %xmm1
-; INT256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
+; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
%index0 = extractelement <8 x i32> %indices, i32 0
%index1 = extractelement <8 x i32> %indices, i32 1
@@ -250,68 +250,199 @@ define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwi
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
-; INT256-LABEL: var_shuffle_v16i16:
-; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm2
-; INT256-NEXT: vmovd %xmm2, %eax
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: movzwl (%rsp,%rax,2), %eax
-; INT256-NEXT: vmovd %eax, %xmm0
-; INT256-NEXT: vpextrw $1, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $2, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $3, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $4, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $5, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $6, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $7, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vmovd %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: movzwl (%rsp,%rax,2), %eax
-; INT256-NEXT: vmovd %eax, %xmm2
-; INT256-NEXT: vpextrw $1, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $2, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $3, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $4, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $5, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $6, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $7, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
-; INT256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
-; INT256-NEXT: retq
+; AVX2-LABEL: var_shuffle_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpextrw $1, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $2, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $3, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $4, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $5, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $6, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $7, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpextrw $1, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $2, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $3, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $4, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $5, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $6, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $7, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: var_shuffle_v16i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vpextrw $1, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $2, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $3, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $4, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $5, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $6, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $7, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $64, %rsp
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovd %xmm2, %eax
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm0
+; AVX512VL-NEXT: vpextrw $1, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $2, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $3, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $4, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $5, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $6, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $7, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm2
+; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: movq %rbp, %rsp
+; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBW-LABEL: var_shuffle_v16i16:
+; AVX512VLBW: # BB#0:
+; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
%index0 = extractelement <16 x i16> %indices, i32 0
%index1 = extractelement <16 x i16> %indices, i32 1
%index2 = extractelement <16 x i16> %indices, i32 2
@@ -492,133 +623,394 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
-; INT256-LABEL: var_shuffle_v32i8:
-; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm2
-; INT256-NEXT: vpextrb $0, %xmm2, %eax
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movq %rsp, %rcx
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vmovd %eax, %xmm0
-; INT256-NEXT: vpextrb $1, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $2, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $3, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $4, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $5, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $6, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $7, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $8, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $9, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $10, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $11, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $12, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $13, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $14, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $15, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $0, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vmovd %eax, %xmm2
-; INT256-NEXT: vpextrb $1, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $2, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $3, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $4, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $5, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $6, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $7, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $8, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $9, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $10, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $11, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $12, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $13, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $14, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $15, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; INT256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
-; INT256-NEXT: retq
+; AVX2-LABEL: var_shuffle_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movq %rsp, %rcx
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: var_shuffle_v32i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movq %rsp, %rcx
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vpextrb $1, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $2, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $3, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $4, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $5, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $6, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $7, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $8, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $9, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $10, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $11, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $12, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $13, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $14, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $15, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v32i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $64, %rsp
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movq %rsp, %rcx
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm0
+; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm2
+; AVX512VL-NEXT: vpextrb $1, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $2, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $3, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $4, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $5, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $6, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $7, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $8, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $9, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $10, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $11, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $12, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $13, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $14, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $15, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: movq %rbp, %rsp
+; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: retq
+;
+; VBMI-LABEL: var_shuffle_v32i8:
+; VBMI: # BB#0:
+; VBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; VBMI-NEXT: retq
%index0 = extractelement <32 x i8> %indices, i32 0
%index1 = extractelement <32 x i8> %indices, i32 1
%index2 = extractelement <32 x i8> %indices, i32 2
@@ -744,30 +1136,65 @@ define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) noun
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
-; INT256-LABEL: var_shuffle_v4f64:
-; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vmovq %xmm1, %rax
-; INT256-NEXT: andl $3, %eax
-; INT256-NEXT: vpextrq $1, %xmm1, %rcx
-; INT256-NEXT: andl $3, %ecx
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm1
-; INT256-NEXT: vmovq %xmm1, %rdx
-; INT256-NEXT: andl $3, %edx
-; INT256-NEXT: vpextrq $1, %xmm1, %rsi
-; INT256-NEXT: andl $3, %esi
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; INT256-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; INT256-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; INT256-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; INT256-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
-; INT256-NEXT: retq
+; AVX2-LABEL: var_shuffle_v4f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: andl $3, %eax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: andl $3, %ecx
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vmovq %xmm1, %rdx
+; AVX2-NEXT: andl $3, %edx
+; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT: andl $3, %esi
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: var_shuffle_v4f64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: andl $3, %eax
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512F-NEXT: andl $3, %ecx
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vmovq %xmm1, %rdx
+; AVX512F-NEXT: andl $3, %edx
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512F-NEXT: andl $3, %esi
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v4f64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBW-LABEL: var_shuffle_v4f64:
+; AVX512VLBW: # BB#0:
+; AVX512VLBW-NEXT: vpermpd %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
%index0 = extractelement <4 x i64> %indices, i32 0
%index1 = extractelement <4 x i64> %indices, i32 1
%index2 = extractelement <4 x i64> %indices, i32 2
@@ -828,44 +1255,7 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi
;
; INT256-LABEL: var_shuffle_v8f32:
; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vpextrq $1, %xmm1, %r8
-; INT256-NEXT: movq %r8, %rcx
-; INT256-NEXT: shrq $30, %rcx
-; INT256-NEXT: vmovq %xmm1, %r9
-; INT256-NEXT: movq %r9, %rdx
-; INT256-NEXT: shrq $30, %rdx
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm1
-; INT256-NEXT: vpextrq $1, %xmm1, %r10
-; INT256-NEXT: movq %r10, %rdi
-; INT256-NEXT: shrq $30, %rdi
-; INT256-NEXT: vmovq %xmm1, %rax
-; INT256-NEXT: movq %rax, %rsi
-; INT256-NEXT: shrq $30, %rsi
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: andl $7, %r9d
-; INT256-NEXT: andl $28, %edx
-; INT256-NEXT: andl $7, %r8d
-; INT256-NEXT: andl $28, %ecx
-; INT256-NEXT: andl $7, %eax
-; INT256-NEXT: andl $28, %esi
-; INT256-NEXT: andl $7, %r10d
-; INT256-NEXT: andl $28, %edi
-; INT256-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; INT256-NEXT: movq %rsp, %rax
-; INT256-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; INT256-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; INT256-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; INT256-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; INT256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; INT256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; INT256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
-; INT256-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
+; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
%index0 = extractelement <8 x i32> %indices, i32 0
%index1 = extractelement <8 x i32> %indices, i32 1
diff --git a/test/CodeGen/X86/var-permute-512.ll b/test/CodeGen/X86/var-permute-512.ll
index bd1f220ceb1..15c7a1c8b8b 100644
--- a/test/CodeGen/X86/var-permute-512.ll
+++ b/test/CodeGen/X86/var-permute-512.ll
@@ -6,47 +6,7 @@
define <8 x i64> @var_shuffle_v8i64(<8 x i64> %v, <8 x i64> %indices) nounwind {
; AVX512-LABEL: var_shuffle_v8i64:
; AVX512: # BB#0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vmovq %xmm1, %r8
-; AVX512-NEXT: andl $7, %r8d
-; AVX512-NEXT: vpextrq $1, %xmm1, %r9
-; AVX512-NEXT: andl $7, %r9d
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %r10
-; AVX512-NEXT: andl $7, %r10d
-; AVX512-NEXT: vpextrq $1, %xmm2, %rsi
-; AVX512-NEXT: andl $7, %esi
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %rdi
-; AVX512-NEXT: andl $7, %edi
-; AVX512-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512-NEXT: andl $7, %eax
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rcx
-; AVX512-NEXT: andl $7, %ecx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512-NEXT: andl $7, %edx
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%index0 = extractelement <8 x i64> %indices, i32 0
%index1 = extractelement <8 x i64> %indices, i32 1
@@ -78,76 +38,7 @@ define <8 x i64> @var_shuffle_v8i64(<8 x i64> %v, <8 x i64> %indices) nounwind {
define <16 x i32> @var_shuffle_v16i32(<16 x i32> %v, <16 x i32> %indices) nounwind {
; AVX512-LABEL: var_shuffle_v16i32:
; AVX512: # BB#0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512-NEXT: vpextrq $1, %xmm4, %rax
-; AVX512-NEXT: vmovq %xmm4, %rdx
-; AVX512-NEXT: movq %rdx, %rcx
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: andl $15, %edx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: movq %rsp, %rdx
-; AVX512-NEXT: vpinsrd $1, (%rcx,%rdx), %xmm0, %xmm0
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vpinsrd $2, (%rsp,%rax,4), %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm3, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $3, (%rcx,%rdx), %xmm0, %xmm0
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $1, (%rcx,%rdx), %xmm4, %xmm3
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vpinsrd $2, (%rsp,%rax,4), %xmm3, %xmm3
-; AVX512-NEXT: vmovq %xmm2, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $3, (%rcx,%rdx), %xmm3, %xmm3
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $1, (%rcx,%rdx), %xmm4, %xmm2
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vpinsrd $2, (%rsp,%rax,4), %xmm2, %xmm2
-; AVX512-NEXT: vmovq %xmm1, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $3, (%rcx,%rdx), %xmm2, %xmm2
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $1, (%rcx,%rdx), %xmm4, %xmm1
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vpinsrd $2, (%rsp,%rax,4), %xmm1, %xmm1
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $3, (%rcx,%rdx), %xmm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%index0 = extractelement <16 x i32> %indices, i32 0
%index1 = extractelement <16 x i32> %indices, i32 1
@@ -381,136 +272,7 @@ define <32 x i16> @var_shuffle_v32i16(<32 x i16> %v, <32 x i16> %indices) nounwi
;
; AVX512BW-LABEL: var_shuffle_v32i16:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512BW-NEXT: vmovd %xmm4, %eax
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm0
-; AVX512BW-NEXT: vpextrw $1, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $2, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $3, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $4, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $5, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $6, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $7, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovd %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm4
-; AVX512BW-NEXT: vpextrw $1, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $2, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $3, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $4, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $5, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $6, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm4, %xmm3
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm4
-; AVX512BW-NEXT: vpextrw $1, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $2, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $3, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $4, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $5, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $6, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT: vmovd %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm4
-; AVX512BW-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm4, %xmm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
+; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
%index0 = extractelement <32 x i16> %indices, i32 0
%index1 = extractelement <32 x i16> %indices, i32 1
@@ -1014,267 +776,10 @@ define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind {
; NOBW-NEXT: popq %rbp
; NOBW-NEXT: retq
;
-; AVX512BW-LABEL: var_shuffle_v64i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movq %rsp, %rsi
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vmovd %edx, %xmm0
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $6, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $7, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $9, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $10, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $11, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $15, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vmovd %edx, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $2, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $3, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $4, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $5, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $12, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $13, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $15, %edx, %xmm4, %xmm3
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vmovd %edx, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $2, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $3, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $4, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $5, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $12, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $13, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $15, %edx, %xmm4, %xmm2
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: movzbl (%rcx,%rsi), %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT: andl $63, %eax
-; AVX512BW-NEXT: vpinsrb $1, (%rax,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: vpinsrb $2, (%rdx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: vpinsrb $3, (%rcx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT: andl $63, %eax
-; AVX512BW-NEXT: vpinsrb $4, (%rax,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: vpinsrb $5, (%rdx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: vpinsrb $6, (%rcx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT: andl $63, %eax
-; AVX512BW-NEXT: vpinsrb $7, (%rax,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: vpinsrb $8, (%rdx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: vpinsrb $9, (%rcx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT: andl $63, %eax
-; AVX512BW-NEXT: vpinsrb $10, (%rax,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: vpinsrb $11, (%rdx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: vpinsrb $12, (%rcx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT: andl $63, %eax
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: movzbl (%rcx,%rsi), %ecx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: movzbl (%rax,%rsi), %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm1
-; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1
-; AVX512BW-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: retq
+; VBMI-LABEL: var_shuffle_v64i8:
+; VBMI: # BB#0:
+; VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; VBMI-NEXT: retq
%index0 = extractelement <64 x i8> %indices, i32 0
%index1 = extractelement <64 x i8> %indices, i32 1
%index2 = extractelement <64 x i8> %indices, i32 2
@@ -1473,43 +978,7 @@ define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind {
define <8 x double> @var_shuffle_v8f64(<8 x double> %v, <8 x i64> %indices) nounwind {
; AVX512-LABEL: var_shuffle_v8f64:
; AVX512: # BB#0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vmovq %xmm1, %r8
-; AVX512-NEXT: andl $7, %r8d
-; AVX512-NEXT: vpextrq $1, %xmm1, %r9
-; AVX512-NEXT: andl $7, %r9d
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %r10
-; AVX512-NEXT: andl $7, %r10d
-; AVX512-NEXT: vpextrq $1, %xmm2, %rsi
-; AVX512-NEXT: andl $7, %esi
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %rdi
-; AVX512-NEXT: andl $7, %edi
-; AVX512-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512-NEXT: andl $7, %eax
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rcx
-; AVX512-NEXT: andl $7, %ecx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512-NEXT: andl $7, %edx
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%index0 = extractelement <8 x i64> %indices, i32 0
%index1 = extractelement <8 x i64> %indices, i32 1
@@ -1541,76 +1010,7 @@ define <8 x double> @var_shuffle_v8f64(<8 x double> %v, <8 x i64> %indices) noun
define <16 x float> @var_shuffle_v16f32(<16 x float> %v, <16 x i32> %indices) nounwind {
; AVX512-LABEL: var_shuffle_v16f32:
; AVX512: # BB#0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512-NEXT: vpextrq $1, %xmm4, %rax
-; AVX512-NEXT: vmovq %xmm4, %rdx
-; AVX512-NEXT: movq %rdx, %rcx
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: andl $15, %edx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: movq %rsp, %rdx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX512-NEXT: vmovq %xmm3, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],mem[0],xmm4[2,3]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
-; AVX512-NEXT: vmovq %xmm2, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0],mem[0],xmm4[2,3]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX512-NEXT: vmovq %xmm1, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],mem[0],xmm4[2,3]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%index0 = extractelement <16 x i32> %indices, i32 0
%index1 = extractelement <16 x i32> %indices, i32 1
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index c6335d751ed..2f52bab2803 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -2288,67 +2288,19 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
; VEX-NEXT: popq %rax
; VEX-NEXT: retq
;
-; AVX512F-LABEL: fptosi_2f16_to_4i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: vcvttss2si %xmm1, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vcvttss2si %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_2f16_to_4i32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: vcvttss2si %xmm1, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_2f16_to_4i32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512DQ-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512DQ-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512DQ-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512DQ-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512DQ-NEXT: vcvttss2si %xmm1, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm1
-; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_2f16_to_4i32:
-; AVX512VLDQ: # BB#0:
-; AVX512VLDQ-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VLDQ-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VLDQ-NEXT: vcvttss2si %xmm1, %rax
-; AVX512VLDQ-NEXT: vmovq %rax, %xmm1
-; AVX512VLDQ-NEXT: vcvttss2si %xmm0, %rax
-; AVX512VLDQ-NEXT: vmovq %rax, %xmm0
-; AVX512VLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512VLDQ-NEXT: retq
+; AVX512-LABEL: fptosi_2f16_to_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vcvttss2si %xmm1, %rax
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vcvttss2si %xmm0, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX512-NEXT: retq
%cvt = fptosi <2 x half> %a to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %ext
diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll
index 6e664ba98d9..9feff88a576 100644
--- a/test/CodeGen/X86/vector-half-conversions.ll
+++ b/test/CodeGen/X86/vector-half-conversions.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
;
@@ -9,35 +9,12 @@
;
define float @cvt_i16_to_f32(i16 %a0) nounwind {
-; AVX1-LABEL: cvt_i16_to_f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl %di, %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_i16_to_f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl %di, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_i16_to_f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl %di, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_i16_to_f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl %di, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_i16_to_f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: retq
%1 = bitcast i16 %a0 to half
%2 = fpext half %1 to float
ret float %2
@@ -111,19 +88,18 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4i16_to_4f32:
@@ -222,19 +198,18 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_4f32:
@@ -271,201 +246,54 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
}
define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
-; AVX1-LABEL: cvt_8i16_to_8f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX1-NEXT: movq %rdx, %r8
-; AVX1-NEXT: movq %rdx, %r10
-; AVX1-NEXT: movswl %dx, %r9d
-; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
-; AVX1-NEXT: shrl $16, %edx
-; AVX1-NEXT: shrq $32, %r8
-; AVX1-NEXT: shrq $48, %r10
-; AVX1-NEXT: vmovq %xmm0, %rdi
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: movq %rdi, %rsi
-; AVX1-NEXT: movswl %di, %ecx
-; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
-; AVX1-NEXT: shrl $16, %edi
-; AVX1-NEXT: shrq $32, %rax
-; AVX1-NEXT: shrq $48, %rsi
-; AVX1-NEXT: movswl %si, %esi
-; AVX1-NEXT: vmovd %esi, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: cwtl
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: movswl %di, %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: vmovd %ecx, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: movswl %r10w, %eax
-; AVX1-NEXT: vmovd %eax, %xmm4
-; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX1-NEXT: movswl %r8w, %eax
-; AVX1-NEXT: vmovd %eax, %xmm5
-; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT: movswl %dx, %eax
-; AVX1-NEXT: vmovd %eax, %xmm6
-; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX1-NEXT: vmovd %r9d, %xmm7
-; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_8i16_to_8f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX2-NEXT: movq %rdx, %r8
-; AVX2-NEXT: movq %rdx, %r10
-; AVX2-NEXT: movswl %dx, %r9d
-; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
-; AVX2-NEXT: shrl $16, %edx
-; AVX2-NEXT: shrq $32, %r8
-; AVX2-NEXT: shrq $48, %r10
-; AVX2-NEXT: vmovq %xmm0, %rdi
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: movq %rdi, %rsi
-; AVX2-NEXT: movswl %di, %ecx
-; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
-; AVX2-NEXT: shrl $16, %edi
-; AVX2-NEXT: shrq $32, %rax
-; AVX2-NEXT: shrq $48, %rsi
-; AVX2-NEXT: movswl %si, %esi
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: cwtl
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: movswl %di, %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: vmovd %ecx, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: movswl %r10w, %eax
-; AVX2-NEXT: vmovd %eax, %xmm4
-; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX2-NEXT: movswl %r8w, %eax
-; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT: movswl %dx, %eax
-; AVX2-NEXT: vmovd %eax, %xmm6
-; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX2-NEXT: vmovd %r9d, %xmm7
-; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_8i16_to_8f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512F-NEXT: movq %rdx, %r8
-; AVX512F-NEXT: movq %rdx, %r9
-; AVX512F-NEXT: movswl %dx, %r10d
-; AVX512F-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
-; AVX512F-NEXT: shrl $16, %edx
-; AVX512F-NEXT: shrq $32, %r8
-; AVX512F-NEXT: shrq $48, %r9
-; AVX512F-NEXT: vmovq %xmm0, %rdi
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq %rdi, %rcx
-; AVX512F-NEXT: movswl %di, %esi
-; AVX512F-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
-; AVX512F-NEXT: shrl $16, %edi
-; AVX512F-NEXT: shrq $32, %rax
-; AVX512F-NEXT: shrq $48, %rcx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl %di, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: vmovd %esi, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: movswl %r9w, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: movswl %r8w, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: movswl %dx, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: vmovd %r10d, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_8i16_to_8f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512VL-NEXT: movq %rdx, %r8
-; AVX512VL-NEXT: movq %rdx, %r10
-; AVX512VL-NEXT: movswl %dx, %r9d
-; AVX512VL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
-; AVX512VL-NEXT: shrl $16, %edx
-; AVX512VL-NEXT: shrq $32, %r8
-; AVX512VL-NEXT: shrq $48, %r10
-; AVX512VL-NEXT: vmovq %xmm0, %rdi
-; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq %rdi, %rsi
-; AVX512VL-NEXT: movswl %di, %ecx
-; AVX512VL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
-; AVX512VL-NEXT: shrl $16, %edi
-; AVX512VL-NEXT: shrq $32, %rax
-; AVX512VL-NEXT: shrq $48, %rsi
-; AVX512VL-NEXT: movswl %si, %esi
-; AVX512VL-NEXT: vmovd %esi, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: cwtl
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl %di, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %ecx, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl %r10w, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: movswl %r8w, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl %dx, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: vmovd %r9d, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_8i16_to_8f32:
+; ALL: # BB#0:
+; ALL-NEXT: vpextrq $1, %xmm0, %rdx
+; ALL-NEXT: movq %rdx, %r8
+; ALL-NEXT: movq %rdx, %r10
+; ALL-NEXT: movswl %dx, %r9d
+; ALL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
+; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: shrq $32, %r8
+; ALL-NEXT: shrq $48, %r10
+; ALL-NEXT: vmovq %xmm0, %rdi
+; ALL-NEXT: movq %rdi, %rax
+; ALL-NEXT: movq %rdi, %rsi
+; ALL-NEXT: movswl %di, %ecx
+; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
+; ALL-NEXT: shrl $16, %edi
+; ALL-NEXT: shrq $32, %rax
+; ALL-NEXT: shrq $48, %rsi
+; ALL-NEXT: movswl %si, %esi
+; ALL-NEXT: vmovd %esi, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vmovd %ecx, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: movswl %r10w, %eax
+; ALL-NEXT: vmovd %eax, %xmm4
+; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
+; ALL-NEXT: movswl %r8w, %eax
+; ALL-NEXT: vmovd %eax, %xmm5
+; ALL-NEXT: vcvtph2ps %xmm5, %xmm5
+; ALL-NEXT: movswl %dx, %eax
+; ALL-NEXT: vmovd %eax, %xmm6
+; ALL-NEXT: vcvtph2ps %xmm6, %xmm6
+; ALL-NEXT: vmovd %r9d, %xmm7
+; ALL-NEXT: vcvtph2ps %xmm7, %xmm7
+; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; ALL-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
%2 = fpext <8 x half> %1 to <8 x float>
ret <8 x float> %2
@@ -664,98 +492,98 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
;
; AVX512F-LABEL: cvt_16i16_to_16f32:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm10
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm2
+; AVX512F-NEXT: vmovd %ecx, %xmm8
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm3
+; AVX512F-NEXT: vmovd %ecx, %xmm9
; AVX512F-NEXT: movswl %ax, %ecx
; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm4
+; AVX512F-NEXT: vmovd %eax, %xmm11
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vmovd %ecx, %xmm0
+; AVX512F-NEXT: vmovd %ecx, %xmm12
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm5
+; AVX512F-NEXT: vmovd %ecx, %xmm13
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm6
+; AVX512F-NEXT: vmovd %ecx, %xmm14
; AVX512F-NEXT: movswl %ax, %ecx
; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: vmovd %ecx, %xmm8
+; AVX512F-NEXT: vmovd %eax, %xmm15
+; AVX512F-NEXT: vmovq %xmm10, %rax
+; AVX512F-NEXT: vmovd %ecx, %xmm2
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm9
+; AVX512F-NEXT: vmovd %ecx, %xmm3
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm10
+; AVX512F-NEXT: vmovd %ecx, %xmm1
; AVX512F-NEXT: movswl %ax, %ecx
; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm11
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vmovd %ecx, %xmm1
+; AVX512F-NEXT: vmovd %eax, %xmm4
+; AVX512F-NEXT: vpextrq $1, %xmm10, %rax
+; AVX512F-NEXT: vmovd %ecx, %xmm10
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm12
+; AVX512F-NEXT: vmovd %ecx, %xmm5
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm13
+; AVX512F-NEXT: vmovd %ecx, %xmm6
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: shrl $16, %ecx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm14
+; AVX512F-NEXT: vmovd %ecx, %xmm7
; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm15
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm16
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vcvtph2ps %ymm8, %zmm8
-; AVX512F-NEXT: vcvtph2ps %ymm9, %zmm9
-; AVX512F-NEXT: vcvtph2ps %ymm10, %zmm10
-; AVX512F-NEXT: vcvtph2ps %ymm11, %zmm11
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: vcvtph2ps %ymm12, %zmm12
-; AVX512F-NEXT: vcvtph2ps %ymm13, %zmm13
-; AVX512F-NEXT: vcvtph2ps %ymm14, %zmm14
-; AVX512F-NEXT: vcvtph2ps %ymm15, %zmm15
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm13[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm10[0],xmm1[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm9[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm6[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm16[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm8, %xmm8
+; AVX512F-NEXT: vcvtph2ps %xmm9, %xmm9
+; AVX512F-NEXT: vcvtph2ps %xmm11, %xmm11
+; AVX512F-NEXT: vcvtph2ps %xmm12, %xmm12
+; AVX512F-NEXT: vcvtph2ps %xmm13, %xmm13
+; AVX512F-NEXT: vcvtph2ps %xmm14, %xmm14
+; AVX512F-NEXT: vcvtph2ps %xmm15, %xmm15
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512F-NEXT: vcvtph2ps %xmm10, %xmm10
+; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_16i16_to_16f32:
@@ -863,35 +691,12 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
;
define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
-; AVX1-LABEL: load_cvt_i16_to_f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_i16_to_f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_i16_to_f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_i16_to_f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_i16_to_f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: retq
%1 = load i16, i16* %a0
%2 = bitcast i16 %1 to half
%3 = fpext half %2 to float
@@ -899,82 +704,24 @@ define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
}
define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_4i16_to_4f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl 6(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 4(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_4i16_to_4f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl 6(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 4(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_4i16_to_4f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_4i16_to_4f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_4i16_to_4f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl 6(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a0
%2 = bitcast <4 x i16> %1 to <4 x half>
%3 = fpext <4 x half> %2 to <4 x float>
@@ -1046,19 +793,18 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_8i16_to_4f32:
@@ -1096,145 +842,40 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
}
define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_8i16_to_8f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl 6(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 4(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: movswl 14(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm4
-; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX1-NEXT: movswl 12(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm5
-; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT: movswl 8(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm6
-; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX1-NEXT: movswl 10(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm7
-; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_8i16_to_8f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl 6(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 4(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: movswl 14(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm4
-; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX2-NEXT: movswl 12(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT: movswl 8(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm6
-; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX2-NEXT: movswl 10(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm7
-; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_8i16_to_8f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: movswl 14(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: movswl 12(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: movswl 8(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: movswl 10(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_8i16_to_8f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl 14(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: movswl 12(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl 8(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: movswl 10(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_8i16_to_8f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl 6(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: movswl 14(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm4
+; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
+; ALL-NEXT: movswl 12(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm5
+; ALL-NEXT: vcvtph2ps %xmm5, %xmm5
+; ALL-NEXT: movswl 8(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm6
+; ALL-NEXT: vcvtph2ps %xmm6, %xmm6
+; ALL-NEXT: movswl 10(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm7
+; ALL-NEXT: vcvtph2ps %xmm7, %xmm7
+; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; ALL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = bitcast <8 x i16> %1 to <8 x half>
%3 = fpext <8 x half> %2 to <8 x float>
@@ -1378,65 +1019,65 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
; AVX512F: # BB#0:
; AVX512F-NEXT: movswl 6(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm16
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm8
; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm17
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm9
; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm10
; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm11
; AVX512F-NEXT: movswl 14(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm12
; AVX512F-NEXT: movswl 12(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm13
; AVX512F-NEXT: movswl 8(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm14
; AVX512F-NEXT: movswl 10(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm15
; AVX512F-NEXT: movswl 22(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm8
-; AVX512F-NEXT: vcvtph2ps %ymm8, %zmm8
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl 20(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm9
-; AVX512F-NEXT: vcvtph2ps %ymm9, %zmm9
+; AVX512F-NEXT: vmovd %eax, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl 16(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm10
-; AVX512F-NEXT: vcvtph2ps %ymm10, %zmm10
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: movswl 18(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm11
-; AVX512F-NEXT: vcvtph2ps %ymm11, %zmm11
+; AVX512F-NEXT: vmovd %eax, %xmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: movswl 30(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm12
-; AVX512F-NEXT: vcvtph2ps %ymm12, %zmm12
+; AVX512F-NEXT: vmovd %eax, %xmm4
+; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
; AVX512F-NEXT: movswl 28(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm13
-; AVX512F-NEXT: vcvtph2ps %ymm13, %zmm13
+; AVX512F-NEXT: vmovd %eax, %xmm5
+; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5
; AVX512F-NEXT: movswl 24(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm14
-; AVX512F-NEXT: vcvtph2ps %ymm14, %zmm14
+; AVX512F-NEXT: vmovd %eax, %xmm6
+; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX512F-NEXT: movswl 26(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm15
-; AVX512F-NEXT: vcvtph2ps %ymm15, %zmm15
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm13[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm12[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0],xmm1[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; AVX512F-NEXT: vmovd %eax, %xmm7
+; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm17[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm16[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
@@ -1518,38 +1159,13 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
;
define double @cvt_i16_to_f64(i16 %a0) nounwind {
-; AVX1-LABEL: cvt_i16_to_f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl %di, %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_i16_to_f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl %di, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_i16_to_f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl %di, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_i16_to_f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl %di, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_i16_to_f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: retq
%1 = bitcast i16 %a0 to half
%2 = fpext half %1 to double
ret double %2
@@ -1599,13 +1215,12 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_2i16_to_2f64:
@@ -1701,15 +1316,15 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %esi, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
@@ -1791,13 +1406,12 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_2f64:
@@ -1892,15 +1506,15 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %esi, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
@@ -1950,25 +1564,25 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_8f64:
; AVX1: # BB#0:
; AVX1-NEXT: vmovq %xmm0, %rdx
-; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: movq %rdx, %r9
; AVX1-NEXT: movl %edx, %r10d
-; AVX1-NEXT: movswl %dx, %r9d
+; AVX1-NEXT: movswl %dx, %r8d
; AVX1-NEXT: shrq $48, %rdx
-; AVX1-NEXT: shrq $32, %r8
+; AVX1-NEXT: shrq $32, %r9
; AVX1-NEXT: shrl $16, %r10d
; AVX1-NEXT: vpextrq $1, %xmm0, %rdi
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: movl %edi, %esi
+; AVX1-NEXT: movq %rdi, %rsi
+; AVX1-NEXT: movl %edi, %eax
; AVX1-NEXT: movswl %di, %ecx
; AVX1-NEXT: shrq $48, %rdi
-; AVX1-NEXT: shrq $32, %rax
-; AVX1-NEXT: shrl $16, %esi
-; AVX1-NEXT: movswl %si, %esi
-; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: shrq $32, %rsi
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX1-NEXT: cwtl
+; AVX1-NEXT: movswl %si, %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3
; AVX1-NEXT: movswl %di, %eax
@@ -1977,9 +1591,9 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX1-NEXT: movswl %r10w, %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: vmovd %r9d, %xmm5
+; AVX1-NEXT: vmovd %r8d, %xmm5
; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT: movswl %r8w, %eax
+; AVX1-NEXT: movswl %r9w, %eax
; AVX1-NEXT: vmovd %eax, %xmm6
; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX1-NEXT: movswl %dx, %eax
@@ -2004,25 +1618,25 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX2-LABEL: cvt_8i16_to_8f64:
; AVX2: # BB#0:
; AVX2-NEXT: vmovq %xmm0, %rdx
-; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: movq %rdx, %r9
; AVX2-NEXT: movl %edx, %r10d
-; AVX2-NEXT: movswl %dx, %r9d
+; AVX2-NEXT: movswl %dx, %r8d
; AVX2-NEXT: shrq $48, %rdx
-; AVX2-NEXT: shrq $32, %r8
+; AVX2-NEXT: shrq $32, %r9
; AVX2-NEXT: shrl $16, %r10d
; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: movl %edi, %esi
+; AVX2-NEXT: movq %rdi, %rsi
+; AVX2-NEXT: movl %edi, %eax
; AVX2-NEXT: movswl %di, %ecx
; AVX2-NEXT: shrq $48, %rdi
-; AVX2-NEXT: shrq $32, %rax
-; AVX2-NEXT: shrl $16, %esi
-; AVX2-NEXT: movswl %si, %esi
-; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: shrq $32, %rsi
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
; AVX2-NEXT: vmovd %ecx, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX2-NEXT: cwtl
+; AVX2-NEXT: movswl %si, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
; AVX2-NEXT: movswl %di, %eax
@@ -2031,9 +1645,9 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX2-NEXT: movswl %r10w, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vmovd %r9d, %xmm5
+; AVX2-NEXT: vmovd %r8d, %xmm5
; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT: movswl %r8w, %eax
+; AVX2-NEXT: movswl %r9w, %eax
; AVX2-NEXT: vmovd %eax, %xmm6
; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX2-NEXT: movswl %dx, %eax
@@ -2055,115 +1669,60 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: cvt_8i16_to_8f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512F-NEXT: movq %rdx, %r8
-; AVX512F-NEXT: movl %edx, %r9d
-; AVX512F-NEXT: movswl %dx, %r10d
-; AVX512F-NEXT: shrq $48, %rdx
-; AVX512F-NEXT: shrq $32, %r8
-; AVX512F-NEXT: shrl $16, %r9d
-; AVX512F-NEXT: vmovq %xmm0, %rdi
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movl %edi, %ecx
-; AVX512F-NEXT: movswl %di, %esi
-; AVX512F-NEXT: shrq $48, %rdi
-; AVX512F-NEXT: shrq $32, %rax
-; AVX512F-NEXT: shrl $16, %ecx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vmovd %esi, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl %di, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: movswl %r9w, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: vmovd %r10d, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: movswl %r8w, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: movswl %dx, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512F-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_8i16_to_8f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512VL-NEXT: movq %rdx, %r8
-; AVX512VL-NEXT: movl %edx, %r10d
-; AVX512VL-NEXT: movswl %dx, %r9d
-; AVX512VL-NEXT: shrq $48, %rdx
-; AVX512VL-NEXT: shrq $32, %r8
-; AVX512VL-NEXT: shrl $16, %r10d
-; AVX512VL-NEXT: vmovq %xmm0, %rdi
-; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movl %edi, %esi
-; AVX512VL-NEXT: movswl %di, %ecx
-; AVX512VL-NEXT: shrq $48, %rdi
-; AVX512VL-NEXT: shrq $32, %rax
-; AVX512VL-NEXT: shrl $16, %esi
-; AVX512VL-NEXT: movswl %si, %esi
-; AVX512VL-NEXT: vmovd %esi, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %ecx, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: cwtl
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl %di, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl %r10w, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: vmovd %r9d, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl %r8w, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: movswl %dx, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512VL-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: cvt_8i16_to_8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT: movq %rdx, %r9
+; AVX512-NEXT: movl %edx, %r10d
+; AVX512-NEXT: movswl %dx, %r8d
+; AVX512-NEXT: shrq $48, %rdx
+; AVX512-NEXT: shrq $32, %r9
+; AVX512-NEXT: shrl $16, %r10d
+; AVX512-NEXT: vmovq %xmm0, %rdi
+; AVX512-NEXT: movq %rdi, %rsi
+; AVX512-NEXT: movl %edi, %eax
+; AVX512-NEXT: movswl %di, %ecx
+; AVX512-NEXT: shrq $48, %rdi
+; AVX512-NEXT: shrq $32, %rsi
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovd %ecx, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movswl %si, %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movswl %di, %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl %r10w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: vmovd %r8d, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl %r9w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: movswl %dx, %eax
+; AVX512-NEXT: vmovd %eax, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
%2 = fpext <8 x half> %1 to <8 x double>
ret <8 x double> %2
@@ -2174,38 +1733,13 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
;
define double @load_cvt_i16_to_f64(i16* %a0) nounwind {
-; AVX1-LABEL: load_cvt_i16_to_f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_i16_to_f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_i16_to_f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_i16_to_f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_i16_to_f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: retq
%1 = load i16, i16* %a0
%2 = bitcast i16 %1 to half
%3 = fpext half %2 to double
@@ -2213,58 +1747,18 @@ define double @load_cvt_i16_to_f64(i16* %a0) nounwind {
}
define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_2i16_to_2f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_2i16_to_2f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_2i16_to_2f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_2i16_to_2f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_2i16_to_2f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; ALL-NEXT: retq
%1 = load <2 x i16>, <2 x i16>* %a0
%2 = bitcast <2 x i16> %1 to <2 x half>
%3 = fpext <2 x half> %2 to <2 x double>
@@ -2272,97 +1766,28 @@ define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
}
define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_4i16_to_4f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: movswl 4(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: movswl 6(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_4i16_to_4f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: movswl 4(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: movswl 6(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_4i16_to_4f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_4i16_to_4f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_4i16_to_4f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: movswl 6(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a0
%2 = bitcast <4 x i16> %1 to <4 x half>
%3 = fpext <4 x half> %2 to <4 x double>
@@ -2439,15 +1864,15 @@ define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %esi, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
@@ -2579,91 +2004,48 @@ define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: load_cvt_8i16_to_8f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: movswl 8(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: movswl 10(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: movswl 12(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: movswl 14(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512F-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_8i16_to_8f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl 8(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: movswl 10(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl 12(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: movswl 14(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512VL-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: load_cvt_8i16_to_8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: movswl (%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: movswl 2(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movswl 4(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movswl 6(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl 8(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: movswl 10(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl 12(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: movswl 14(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = bitcast <8 x i16> %1 to <8 x half>
%3 = fpext <8 x half> %2 to <8 x double>
@@ -2675,138 +2057,41 @@ define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
;
define i16 @cvt_f32_to_i16(float %a0) nounwind {
-; AVX1-LABEL: cvt_f32_to_i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_f32_to_i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_f32_to_i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_f32_to_i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_f32_to_i16:
+; ALL: # BB#0:
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ALL-NEXT: retq
%1 = fptrunc float %a0 to half
%2 = bitcast half %1 to i16
ret i16 %2
}
define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
-; AVX1-LABEL: cvt_4f32_to_4i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: movzwl %cx, %ecx
-; AVX1-NEXT: orl %eax, %ecx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: movzwl %dx, %edx
-; AVX1-NEXT: orl %eax, %edx
-; AVX1-NEXT: shlq $32, %rdx
-; AVX1-NEXT: orq %rcx, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_4f32_to_4i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: movzwl %cx, %ecx
-; AVX2-NEXT: orl %eax, %ecx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: movzwl %dx, %edx
-; AVX2-NEXT: orl %eax, %edx
-; AVX2-NEXT: shlq $32, %rdx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_4f32_to_4i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
-; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
-; AVX512F-NEXT: orl %eax, %edx
-; AVX512F-NEXT: shlq $32, %rdx
-; AVX512F-NEXT: orq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_4f32_to_4i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: movzwl %cx, %ecx
-; AVX512VL-NEXT: orl %eax, %ecx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %edx
-; AVX512VL-NEXT: movzwl %dx, %edx
-; AVX512VL-NEXT: orl %eax, %edx
-; AVX512VL-NEXT: shlq $32, %rdx
-; AVX512VL-NEXT: orq %rcx, %rdx
-; AVX512VL-NEXT: vmovq %rdx, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_4f32_to_4i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
ret <4 x i16> %2
@@ -2865,29 +2150,27 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
;
; AVX512F-LABEL: cvt_4f32_to_8i16_undef:
; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: movzwl %cx, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
+; AVX512F-NEXT: movzwl %dx, %edx
; AVX512F-NEXT: orl %eax, %edx
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f32_to_8i16_undef:
@@ -2974,29 +2257,27 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
;
; AVX512F-LABEL: cvt_4f32_to_8i16_zero:
; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: movzwl %cx, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
+; AVX512F-NEXT: movzwl %dx, %edx
; AVX512F-NEXT: orl %eax, %edx
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f32_to_8i16_zero:
@@ -3033,194 +2314,52 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
}
define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
-; AVX1-LABEL: cvt_8f32_to_8i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: movzwl %cx, %ecx
-; AVX1-NEXT: orl %eax, %ecx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %edx
-; AVX1-NEXT: shll $16, %edx
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: movzwl %ax, %eax
-; AVX1-NEXT: orl %edx, %eax
-; AVX1-NEXT: shlq $32, %rax
-; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: shll $16, %ecx
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %edx
-; AVX1-NEXT: movzwl %dx, %edx
-; AVX1-NEXT: orl %ecx, %edx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: shll $16, %ecx
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %esi
-; AVX1-NEXT: movzwl %si, %esi
-; AVX1-NEXT: orl %ecx, %esi
-; AVX1-NEXT: shlq $32, %rsi
-; AVX1-NEXT: orq %rdx, %rsi
-; AVX1-NEXT: vmovq %rsi, %xmm0
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_8f32_to_8i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: movzwl %cx, %ecx
-; AVX2-NEXT: orl %eax, %ecx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: shll $16, %edx
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: orl %edx, %eax
-; AVX2-NEXT: shlq $32, %rax
-; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: shll $16, %ecx
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: movzwl %dx, %edx
-; AVX2-NEXT: orl %ecx, %edx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: shll $16, %ecx
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %esi
-; AVX2-NEXT: movzwl %si, %esi
-; AVX2-NEXT: orl %ecx, %esi
-; AVX2-NEXT: shlq $32, %rsi
-; AVX2-NEXT: orq %rdx, %rsi
-; AVX2-NEXT: vmovq %rsi, %xmm0
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_8f32_to_8i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
-; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %edx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: shll $16, %eax
-; AVX512F-NEXT: orl %edx, %eax
-; AVX512F-NEXT: shlq $32, %rax
-; AVX512F-NEXT: orq %rcx, %rax
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: movzwl %cx, %ecx
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %edx
-; AVX512F-NEXT: shll $16, %edx
-; AVX512F-NEXT: orl %ecx, %edx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: movzwl %cx, %ecx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %esi
-; AVX512F-NEXT: shll $16, %esi
-; AVX512F-NEXT: orl %ecx, %esi
-; AVX512F-NEXT: shlq $32, %rsi
-; AVX512F-NEXT: orq %rdx, %rsi
-; AVX512F-NEXT: vmovq %rsi, %xmm0
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_8f32_to_8i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: movzwl %cx, %ecx
-; AVX512VL-NEXT: orl %eax, %ecx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %edx
-; AVX512VL-NEXT: shll $16, %edx
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: movzwl %ax, %eax
-; AVX512VL-NEXT: orl %edx, %eax
-; AVX512VL-NEXT: shlq $32, %rax
-; AVX512VL-NEXT: orq %rcx, %rax
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: shll $16, %ecx
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %edx
-; AVX512VL-NEXT: movzwl %dx, %edx
-; AVX512VL-NEXT: orl %ecx, %edx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: shll $16, %ecx
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %esi
-; AVX512VL-NEXT: movzwl %si, %esi
-; AVX512VL-NEXT: orl %ecx, %esi
-; AVX512VL-NEXT: shlq $32, %rsi
-; AVX512VL-NEXT: orq %rdx, %rsi
-; AVX512VL-NEXT: vmovq %rsi, %xmm0
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_8f32_to_8i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %edx
+; ALL-NEXT: shll $16, %edx
+; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: movzwl %ax, %eax
+; ALL-NEXT: orl %edx, %eax
+; ALL-NEXT: shlq $32, %rax
+; ALL-NEXT: orq %rcx, %rax
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: shll $16, %ecx
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %ecx, %edx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: shll $16, %ecx
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %esi
+; ALL-NEXT: movzwl %si, %esi
+; ALL-NEXT: orl %ecx, %esi
+; ALL-NEXT: shlq $32, %rsi
+; ALL-NEXT: orq %rdx, %rsi
+; ALL-NEXT: vmovq %rsi, %xmm0
+; ALL-NEXT: vmovq %rax, %xmm1
+; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%1 = fptrunc <8 x float> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
ret <8 x i16> %2
@@ -3361,141 +2500,73 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: cvt_16f32_to_16i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm1
-; AVX512F-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm0
-; AVX512F-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_16f32_to_16i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm2
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm1
-; AVX512VL-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX512VL-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: cvt_16f32_to_16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: retq
%1 = fptrunc <16 x float> %a0 to <16 x half>
%2 = bitcast <16 x half> %1 to <16 x i16>
ret <16 x i16> %2
@@ -3506,35 +2577,12 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
;
define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind {
-; AVX1-LABEL: store_cvt_f32_to_i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: movw %ax, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: store_cvt_f32_to_i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: movw %ax, (%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: store_cvt_f32_to_i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: movw %ax, (%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_f32_to_i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: movw %ax, (%rdi)
-; AVX512VL-NEXT: retq
+; ALL-LABEL: store_cvt_f32_to_i16:
+; ALL: # BB#0:
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: movw %ax, (%rdi)
+; ALL-NEXT: retq
%1 = fptrunc float %a0 to half
%2 = bitcast half %1 to i16
store i16 %2, i16* %a1
@@ -3542,83 +2590,24 @@ define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind {
}
define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind {
-; AVX1-LABEL: store_cvt_4f32_to_4i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %edx
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %esi
-; AVX1-NEXT: movw %si, (%rdi)
-; AVX1-NEXT: movw %dx, 6(%rdi)
-; AVX1-NEXT: movw %cx, 4(%rdi)
-; AVX1-NEXT: movw %ax, 2(%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: store_cvt_4f32_to_4i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %esi
-; AVX2-NEXT: movw %si, (%rdi)
-; AVX2-NEXT: movw %dx, 6(%rdi)
-; AVX2-NEXT: movw %cx, 4(%rdi)
-; AVX2-NEXT: movw %ax, 2(%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: store_cvt_4f32_to_4i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %edx
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %esi
-; AVX512F-NEXT: movw %si, (%rdi)
-; AVX512F-NEXT: movw %dx, 6(%rdi)
-; AVX512F-NEXT: movw %cx, 4(%rdi)
-; AVX512F-NEXT: movw %ax, 2(%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_4f32_to_4i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %edx
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %esi
-; AVX512VL-NEXT: movw %si, (%rdi)
-; AVX512VL-NEXT: movw %dx, 6(%rdi)
-; AVX512VL-NEXT: movw %cx, 4(%rdi)
-; AVX512VL-NEXT: movw %ax, 2(%rdi)
-; AVX512VL-NEXT: retq
+; ALL-LABEL: store_cvt_4f32_to_4i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %edx
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %esi
+; ALL-NEXT: movw %si, (%rdi)
+; ALL-NEXT: movw %dx, 6(%rdi)
+; ALL-NEXT: movw %cx, 4(%rdi)
+; ALL-NEXT: movw %ax, 2(%rdi)
+; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
store <4 x i16> %2, <4 x i16>* %a1
@@ -3680,30 +2669,28 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
;
; AVX512F-LABEL: store_cvt_4f32_to_8i16_undef:
; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: movzwl %cx, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
+; AVX512F-NEXT: movzwl %dx, %edx
; AVX512F-NEXT: orl %eax, %edx
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_4f32_to_8i16_undef:
@@ -3794,30 +2781,28 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
;
; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero:
; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: movzwl %cx, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
+; AVX512F-NEXT: movzwl %dx, %edx
; AVX512F-NEXT: orl %eax, %edx
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero:
@@ -3856,150 +2841,41 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
}
define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
-; AVX1-LABEL: store_cvt_8f32_to_8i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %r8d
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %r9d
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %r10d
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %xmm2, %r11d
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %xmm2, %eax
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %xmm2, %ecx
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %esi
-; AVX1-NEXT: movw %si, 8(%rdi)
-; AVX1-NEXT: movw %dx, (%rdi)
-; AVX1-NEXT: movw %cx, 14(%rdi)
-; AVX1-NEXT: movw %ax, 12(%rdi)
-; AVX1-NEXT: movw %r11w, 10(%rdi)
-; AVX1-NEXT: movw %r10w, 6(%rdi)
-; AVX1-NEXT: movw %r9w, 4(%rdi)
-; AVX1-NEXT: movw %r8w, 2(%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: store_cvt_8f32_to_8i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %r8d
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %r9d
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %r10d
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %xmm2, %r11d
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %xmm2, %eax
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %xmm2, %ecx
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %esi
-; AVX2-NEXT: movw %si, 8(%rdi)
-; AVX2-NEXT: movw %dx, (%rdi)
-; AVX2-NEXT: movw %cx, 14(%rdi)
-; AVX2-NEXT: movw %ax, 12(%rdi)
-; AVX2-NEXT: movw %r11w, 10(%rdi)
-; AVX2-NEXT: movw %r10w, 6(%rdi)
-; AVX2-NEXT: movw %r9w, 4(%rdi)
-; AVX2-NEXT: movw %r8w, 2(%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: store_cvt_8f32_to_8i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %r8d
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %r9d
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %r10d
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %r11d
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %ecx
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %esi
-; AVX512F-NEXT: movw %si, 8(%rdi)
-; AVX512F-NEXT: movw %dx, (%rdi)
-; AVX512F-NEXT: movw %cx, 14(%rdi)
-; AVX512F-NEXT: movw %ax, 12(%rdi)
-; AVX512F-NEXT: movw %r11w, 10(%rdi)
-; AVX512F-NEXT: movw %r10w, 6(%rdi)
-; AVX512F-NEXT: movw %r9w, 4(%rdi)
-; AVX512F-NEXT: movw %r8w, 2(%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_8f32_to_8i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %r8d
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %r9d
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %r10d
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %xmm2, %r11d
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %xmm2, %ecx
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %edx
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %esi
-; AVX512VL-NEXT: movw %si, 8(%rdi)
-; AVX512VL-NEXT: movw %dx, (%rdi)
-; AVX512VL-NEXT: movw %cx, 14(%rdi)
-; AVX512VL-NEXT: movw %ax, 12(%rdi)
-; AVX512VL-NEXT: movw %r11w, 10(%rdi)
-; AVX512VL-NEXT: movw %r10w, 6(%rdi)
-; AVX512VL-NEXT: movw %r9w, 4(%rdi)
-; AVX512VL-NEXT: movw %r8w, 2(%rdi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; ALL-LABEL: store_cvt_8f32_to_8i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %r8d
+; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %r9d
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %r10d
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; ALL-NEXT: vmovd %xmm2, %r11d
+; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; ALL-NEXT: vmovd %xmm2, %eax
+; ALL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; ALL-NEXT: vmovd %xmm2, %ecx
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; ALL-NEXT: vmovd %xmm0, %esi
+; ALL-NEXT: movw %si, 8(%rdi)
+; ALL-NEXT: movw %dx, (%rdi)
+; ALL-NEXT: movw %cx, 14(%rdi)
+; ALL-NEXT: movw %ax, 12(%rdi)
+; ALL-NEXT: movw %r11w, 10(%rdi)
+; ALL-NEXT: movw %r10w, 6(%rdi)
+; ALL-NEXT: movw %r9w, 4(%rdi)
+; ALL-NEXT: movw %r8w, 2(%rdi)
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%1 = fptrunc <8 x float> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
store <8 x i16> %2, <8 x i16>* %a1
@@ -4141,141 +3017,73 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: store_cvt_16f32_to_16i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm4
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm4
-; AVX512F-NEXT: movw %ax, 24(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm4
-; AVX512F-NEXT: movw %ax, 16(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm4
-; AVX512F-NEXT: movw %ax, 8(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
-; AVX512F-NEXT: movw %ax, (%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
-; AVX512F-NEXT: movw %ax, 30(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: movw %ax, 28(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: movw %ax, 26(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: movw %ax, 22(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: movw %ax, 20(%rdi)
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: movw %ax, 18(%rdi)
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: movw %ax, 14(%rdi)
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movw %ax, 12(%rdi)
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: movw %ax, 10(%rdi)
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: movw %ax, 6(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: movw %ax, 4(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: movw %ax, 2(%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_16f32_to_16i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm4
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm4
-; AVX512VL-NEXT: movw %ax, 24(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm4
-; AVX512VL-NEXT: movw %ax, 16(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm4
-; AVX512VL-NEXT: movw %ax, 8(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX512VL-NEXT: movw %ax, (%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX512VL-NEXT: movw %ax, 30(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: movw %ax, 28(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: movw %ax, 26(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: movw %ax, 22(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: movw %ax, 20(%rdi)
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: movw %ax, 18(%rdi)
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: movw %ax, 14(%rdi)
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: movw %ax, 12(%rdi)
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: movw %ax, 10(%rdi)
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: movw %ax, 6(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: movw %ax, 4(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: movw %ax, 2(%rdi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: store_cvt_16f32_to_16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm4
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm4
+; AVX512-NEXT: movw %ax, 24(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm4
+; AVX512-NEXT: movw %ax, 16(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm4
+; AVX512-NEXT: movw %ax, 8(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: movw %ax, (%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: movw %ax, 30(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 28(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 26(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 22(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: movw %ax, 20(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: movw %ax, 18(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: movw %ax, 14(%rdi)
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: movw %ax, 12(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: movw %ax, 10(%rdi)
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: movw %ax, 6(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: movw %ax, 4(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: movw %ax, 2(%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = fptrunc <16 x float> %a0 to <16 x half>
%2 = bitcast <16 x half> %1 to <16 x i16>
store <16 x i16> %2, <16 x i16>* %a1
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index cd4b237735f..25377f26799 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -3333,11 +3333,17 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; AVX1-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: popq %r12
+; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: popq %r13
+; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: popq %r14
+; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: popq %r15
+; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_16i1_to_16i16:
@@ -3424,11 +3430,17 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; AVX2-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: popq %r12
+; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: popq %r13
+; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: popq %r14
+; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: popq %r15
+; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: .cfi_def_cfa_offset 8
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_sext_16i1_to_16i16:
@@ -4824,6 +4836,7 @@ define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; X32-SSE41-NEXT: movd %xmm0, %eax
; X32-SSE41-NEXT: popl %ecx
+; X32-SSE41-NEXT: .cfi_def_cfa_offset 4
; X32-SSE41-NEXT: retl
entry:
%Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index dd329d21dc9..7ef5bee5420 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -3963,10 +3963,20 @@ define <16 x i16> @concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31(<16 x i
}
define <16 x i16> @concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc(<16 x i16> %a, <16 x i16> %b) {
-; ALL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: retq
%ahi = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%bc0hi = bitcast <8 x i16> %ahi to <16 x i8>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index cf1aaca4ee2..56567c7e794 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1053,8 +1053,8 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_3254:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
ret <4 x i64> %shuffle
@@ -1075,8 +1075,8 @@ define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_3276:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6>
ret <4 x i64> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index b95e7cf008a..e4234c05845 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1789,21 +1789,33 @@ define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
}
define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) {
-; ALL-LABEL: shuffle_v8i32_7654fedc:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i32_7654fedc:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i32_7654fedc:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) {
-; ALL-LABEL: shuffle_v8i32_fedc7654:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i32_fedc7654:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i32_fedc7654:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
}
@@ -2177,10 +2189,15 @@ define <8 x i32> @concat_v8i32_0123CDEF(<8 x i32> %a, <8 x i32> %b) {
}
define <8 x i32> @concat_v8i32_4567CDEF_bc(<8 x i32> %a0, <8 x i32> %a1) {
-; ALL-LABEL: concat_v8i32_4567CDEF_bc:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: concat_v8i32_4567CDEF_bc:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: concat_v8i32_4567CDEF_bc:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: retq
%a0hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%a1hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%bc0hi = bitcast <4 x i32> %a0hi to <2 x i64>
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 6c980559721..1d17ef109d2 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1165,14 +1165,31 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_01014545:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_01014545:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-32-NEXT: retl
+
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01014545_mem(<8 x i64>* %ptr, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8i64_01014545_mem:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_01014545_mem:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5]
; AVX512F-32-NEXT: retl
+ %a = load <8 x i64>, <8 x i64>* %ptr
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x i64> %shuffle
}
diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll
index efbe5586747..b107b60cd6d 100644
--- a/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -619,6 +619,7 @@ define <64 x i8> @test_mm512_mask_blend_epi8(<64 x i8> %A, <64 x i8> %W){
; KNL32-NEXT: vpblendvb %ymm3, 8(%ebp), %ymm1, %ymm1
; KNL32-NEXT: movl %ebp, %esp
; KNL32-NEXT: popl %ebp
+; KNL32-NEXT: .cfi_def_cfa %esp, 4
; KNL32-NEXT: retl
entry:
%0 = shufflevector <64 x i8> %A, <64 x i8> %W, <64 x i32> <i32 64, i32 1, i32 66, i32 3, i32 68, i32 5, i32 70, i32 7, i32 72, i32 9, i32 74, i32 11, i32 76, i32 13, i32 78, i32 15, i32 80, i32 17, i32 82, i32 19, i32 84, i32 21, i32 86, i32 23, i32 88, i32 25, i32 90, i32 27, i32 92, i32 29, i32 94, i32 31, i32 96, i32 33, i32 98, i32 35, i32 100, i32 37, i32 102, i32 39, i32 104, i32 41, i32 106, i32 43, i32 108, i32 45, i32 110, i32 47, i32 112, i32 49, i32 114, i32 51, i32 116, i32 53, i32 118, i32 55, i32 120, i32 57, i32 122, i32 59, i32 124, i32 61, i32 126, i32 63>
@@ -659,6 +660,7 @@ define <32 x i16> @test_mm512_mask_blend_epi16(<32 x i16> %A, <32 x i16> %W){
; KNL32-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15]
; KNL32-NEXT: movl %ebp, %esp
; KNL32-NEXT: popl %ebp
+; KNL32-NEXT: .cfi_def_cfa %esp, 4
; KNL32-NEXT: retl
entry:
%0 = shufflevector <32 x i16> %A, <32 x i16> %W, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll
index 8d057290085..0e690347a54 100644
--- a/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -630,6 +630,7 @@ define i64 @shuf64i1_zero(i64 %a) {
; AVX512F-NEXT: orq %rcx, %rax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -662,6 +663,7 @@ define i64 @shuf64i1_zero(i64 %a) {
; AVX512VL-NEXT: orq %rcx, %rax
; AVX512VL-NEXT: movq %rbp, %rsp
; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: .cfi_def_cfa %rsp, 8
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index dc08d88074d..ac1083ad447 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -813,13 +813,10 @@ define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
;
; AVX2-LABEL: trunc16i32_16i16_lshr:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -947,28 +944,52 @@ entry:
}
define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
-; SSE-LABEL: trunc16i32_16i8_lshr:
-; SSE: # BB#0: # %entry
-; SSE-NEXT: psrld $24, %xmm1
-; SSE-NEXT: psrld $24, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: psrld $24, %xmm3
-; SSE-NEXT: psrld $24, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: movdqu %xmm0, (%rax)
-; SSE-NEXT: retq
+; SSE2-LABEL: trunc16i32_16i8_lshr:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: psrld $24, %xmm1
+; SSE2-NEXT: psrld $24, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: psrld $24, %xmm3
+; SSE2-NEXT: psrld $24, %xmm2
+; SSE2-NEXT: packuswb %xmm3, %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc16i32_16i8_lshr:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: psrld $24, %xmm1
+; SSSE3-NEXT: psrld $24, %xmm0
+; SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSSE3-NEXT: psrld $24, %xmm3
+; SSSE3-NEXT: psrld $24, %xmm2
+; SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i32_16i8_lshr:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: psrld $24, %xmm1
+; SSE41-NEXT: psrld $24, %xmm0
+; SSE41-NEXT: packssdw %xmm1, %xmm0
+; SSE41-NEXT: psrld $24, %xmm3
+; SSE41-NEXT: psrld $24, %xmm2
+; SSE41-NEXT: packssdw %xmm3, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i8_lshr:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
@@ -976,16 +997,12 @@ define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
;
; AVX2-LABEL: trunc16i32_16i8_lshr:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/wide-integer-cmp.ll b/test/CodeGen/X86/wide-integer-cmp.ll
index 97460b36a74..9bd53c6fbd3 100644
--- a/test/CodeGen/X86/wide-integer-cmp.ll
+++ b/test/CodeGen/X86/wide-integer-cmp.ll
@@ -105,10 +105,13 @@ define i32 @test_wide(i128 %a, i128 %b) {
; CHECK-NEXT: # BB#1: # %bb1
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB4_2: # %bb2
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: movl $2, %eax
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
entry:
%cmp = icmp slt i128 %a, %b
diff --git a/test/CodeGen/X86/x86-framelowering-trap.ll b/test/CodeGen/X86/x86-framelowering-trap.ll
index f1590abcae8..89f4528fb06 100644
--- a/test/CodeGen/X86/x86-framelowering-trap.ll
+++ b/test/CodeGen/X86/x86-framelowering-trap.ll
@@ -6,6 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: pushq
; CHECK: ud2
; CHECK-NEXT: popq
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
define void @bar() {
entry:
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index acad9f771fc..bc6a6ea205c 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1816,6 +1816,7 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
; AVX1-NEXT: vmovaps %ymm9, 64(%rdi)
; AVX1-NEXT: vmovaps %ymm8, (%rdi)
; AVX1-NEXT: addq $24, %rsp
+; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
diff --git a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
index 763d764698d..929dafbfc21 100644
--- a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
+++ b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
@@ -20,6 +20,7 @@ define x86_64_sysvcc i32 @bar(i32 %a0, i32 %a1, float %b0) #0 {
; CHECK-NEXT: movl $4, %eax
; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: popq %rdx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
call void asm sideeffect "", "~{rax},~{rdx},~{xmm1},~{rdi},~{rsi},~{xmm0}"()
ret i32 4
diff --git a/test/DebugInfo/AArch64/inlined-argument.ll b/test/DebugInfo/AArch64/inlined-argument.ll
new file mode 100644
index 00000000000..868efc28f6a
--- /dev/null
+++ b/test/DebugInfo/AArch64/inlined-argument.ll
@@ -0,0 +1,140 @@
+; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump --name resource - | FileCheck %s
+; CHECK: DW_TAG_formal_parameter
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NEXT: DW_AT_location (DW_OP_reg1 W1)
+; CHECK-NEXT: DW_AT_abstract_origin {{.*}}"resource"
+;
+; Generated from:
+; typedef struct t *t_t;
+; extern unsigned int enable;
+; struct t {
+; struct q {
+; struct q *next;
+; unsigned long long resource;
+; } * s;
+; } * tt;
+; static unsigned long find(t_t t, unsigned long long resource) {
+; struct q *q;
+; q = t->s;
+; while (q) {
+; if (q->resource == resource)
+; return q;
+; q = q->next;
+; }
+; }
+; int g(t_t t, unsigned long long r) {
+; struct q *q;
+; q = find(t, r);
+; if (!q)
+; if (__builtin_expect(enable, 0)) { }
+; }
+
+
+source_filename = "test.i"
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios5.0.0"
+
+%struct.t = type { %struct.q* }
+%struct.q = type { %struct.q*, i64 }
+
+@tt = local_unnamed_addr global %struct.t* null, align 8, !dbg !0
+
+; Function Attrs: noredzone nounwind readonly ssp
+define i32 @g(%struct.t* nocapture readonly %t, i64 %r) local_unnamed_addr #0 !dbg !20 {
+entry:
+ tail call void @llvm.dbg.value(metadata %struct.t* %t, metadata !26, metadata !DIExpression()), !dbg !29
+ tail call void @llvm.dbg.value(metadata i64 %r, metadata !27, metadata !DIExpression()), !dbg !30
+ tail call void @llvm.dbg.value(metadata %struct.t* %t, metadata !31, metadata !DIExpression()), !dbg !39
+ tail call void @llvm.dbg.value(metadata i64 %r, metadata !37, metadata !DIExpression()), !dbg !41
+ %s.i5 = bitcast %struct.t* %t to %struct.q**
+ tail call void @llvm.dbg.value(metadata %struct.q** %s.i5, metadata !38, metadata !DIExpression(DW_OP_deref)), !dbg !42
+ %q.06.i = load %struct.q*, %struct.q** %s.i5, align 8
+ tail call void @llvm.dbg.value(metadata %struct.q* %q.06.i, metadata !38, metadata !DIExpression()), !dbg !42
+ %tobool7.i = icmp eq %struct.q* %q.06.i, null, !dbg !43
+ br i1 %tobool7.i, label %find.exit, label %while.body.i.preheader, !dbg !43
+
+while.body.i.preheader: ; preds = %entry
+ br label %while.body.i, !dbg !44
+
+while.body.i: ; preds = %while.body.i.preheader, %if.end.i
+ %q.08.i = phi %struct.q* [ %q.0.i, %if.end.i ], [ %q.06.i, %while.body.i.preheader ]
+ %resource1.i = getelementptr inbounds %struct.q, %struct.q* %q.08.i, i64 0, i32 1, !dbg !44
+ %0 = load i64, i64* %resource1.i, align 8, !dbg !44
+ %cmp.i = icmp eq i64 %0, %r, !dbg !47
+ br i1 %cmp.i, label %find.exit, label %if.end.i, !dbg !48
+
+if.end.i: ; preds = %while.body.i
+ %next.i6 = bitcast %struct.q* %q.08.i to %struct.q**
+ tail call void @llvm.dbg.value(metadata %struct.q** %next.i6, metadata !38, metadata !DIExpression(DW_OP_deref)), !dbg !42
+ %q.0.i = load %struct.q*, %struct.q** %next.i6, align 8
+ tail call void @llvm.dbg.value(metadata %struct.q* %q.0.i, metadata !38, metadata !DIExpression()), !dbg !42
+ %tobool.i = icmp eq %struct.q* %q.0.i, null, !dbg !43
+ br i1 %tobool.i, label %find.exit, label %while.body.i, !dbg !43, !llvm.loop !49
+
+find.exit: ; preds = %while.body.i, %if.end.i, %entry
+ ret i32 undef, !dbg !52
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { noredzone nounwind readonly ssp }
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!16, !17, !18}
+!llvm.ident = !{!19}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "tt", scope: !2, file: !3, line: 8, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 6.0.0 (trunk 317516) (llvm/trunk 317518)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "test.i", directory: "/")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64)
+!7 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t", file: !3, line: 3, size: 64, elements: !8)
+!8 = !{!9}
+!9 = !DIDerivedType(tag: DW_TAG_member, name: "s", scope: !7, file: !3, line: 7, baseType: !10, size: 64)
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64)
+!11 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "q", file: !3, line: 4, size: 128, elements: !12)
+!12 = !{!13, !14}
+!13 = !DIDerivedType(tag: DW_TAG_member, name: "next", scope: !11, file: !3, line: 5, baseType: !10, size: 64)
+!14 = !DIDerivedType(tag: DW_TAG_member, name: "resource", scope: !11, file: !3, line: 6, baseType: !15, size: 64, offset: 64)
+!15 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!16 = !{i32 2, !"Dwarf Version", i32 2}
+!17 = !{i32 2, !"Debug Info Version", i32 3}
+!18 = !{i32 1, !"wchar_size", i32 4}
+!19 = !{!"clang version 6.0.0 (trunk 317516) (llvm/trunk 317518)"}
+!20 = distinct !DISubprogram(name: "g", scope: !3, file: !3, line: 18, type: !21, isLocal: false, isDefinition: true, scopeLine: 18, flags: DIFlagPrototyped, isOptimized: true, unit: !2, variables: !25)
+!21 = !DISubroutineType(types: !22)
+!22 = !{!23, !24, !15}
+!23 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!24 = !DIDerivedType(tag: DW_TAG_typedef, name: "t_t", file: !3, line: 1, baseType: !6)
+!25 = !{!26, !27, !28}
+!26 = !DILocalVariable(name: "t", arg: 1, scope: !20, file: !3, line: 18, type: !24)
+!27 = !DILocalVariable(name: "r", arg: 2, scope: !20, file: !3, line: 18, type: !15)
+!28 = !DILocalVariable(name: "q", scope: !20, file: !3, line: 19, type: !10)
+!29 = !DILocation(line: 18, column: 11, scope: !20)
+!30 = !DILocation(line: 18, column: 33, scope: !20)
+!31 = !DILocalVariable(name: "t", arg: 1, scope: !32, file: !3, line: 9, type: !24)
+!32 = distinct !DISubprogram(name: "find", scope: !3, file: !3, line: 9, type: !33, isLocal: true, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, unit: !2, variables: !36)
+!33 = !DISubroutineType(types: !34)
+!34 = !{!35, !24, !15}
+!35 = !DIBasicType(name: "long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!36 = !{!31, !37, !38}
+!37 = !DILocalVariable(name: "resource", arg: 2, scope: !32, file: !3, line: 9, type: !15)
+!38 = !DILocalVariable(name: "q", scope: !32, file: !3, line: 10, type: !10)
+!39 = !DILocation(line: 9, column: 31, scope: !32, inlinedAt: !40)
+!40 = distinct !DILocation(line: 20, column: 7, scope: !20)
+!41 = !DILocation(line: 9, column: 53, scope: !32, inlinedAt: !40)
+!42 = !DILocation(line: 10, column: 13, scope: !32, inlinedAt: !40)
+!43 = !DILocation(line: 12, column: 3, scope: !32, inlinedAt: !40)
+!44 = !DILocation(line: 13, column: 12, scope: !45, inlinedAt: !40)
+!45 = distinct !DILexicalBlock(scope: !46, file: !3, line: 13, column: 9)
+!46 = distinct !DILexicalBlock(scope: !32, file: !3, line: 12, column: 13)
+!47 = !DILocation(line: 13, column: 21, scope: !45, inlinedAt: !40)
+!48 = !DILocation(line: 13, column: 9, scope: !46, inlinedAt: !40)
+!49 = distinct !{!49, !50, !51}
+!50 = !DILocation(line: 12, column: 3, scope: !32)
+!51 = !DILocation(line: 16, column: 3, scope: !32)
+!52 = !DILocation(line: 24, column: 1, scope: !20)
diff --git a/test/DebugInfo/ARM/illegal-fragment.ll b/test/DebugInfo/ARM/illegal-fragment.ll
new file mode 100644
index 00000000000..41e28faa708
--- /dev/null
+++ b/test/DebugInfo/ARM/illegal-fragment.ll
@@ -0,0 +1,95 @@
+; RUN: llc -filetype=obj %s -o - | llvm-dwarfdump - | FileCheck %s
+; CHECK: file format Mach-O arm
+; ModuleID = 'test.ll'
+source_filename = "test.i"
+target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7s-apple-ios5.0.0"
+
+%struct.vm_object = type { i64 }
+
+; Function Attrs: nounwind ssp
+define void @f(%struct.vm_object* %object, i64* nocapture readonly %start) local_unnamed_addr #0 !dbg !11 {
+entry:
+ tail call void @llvm.dbg.value(metadata %struct.vm_object* %object, metadata !21, metadata !DIExpression()), !dbg !27
+ tail call void @llvm.dbg.value(metadata i64* %start, metadata !22, metadata !DIExpression()), !dbg !28
+ tail call void @llvm.dbg.value(metadata i64 %0, metadata !25, metadata !DIExpression()), !dbg !29
+ tail call void @llvm.dbg.value(metadata i64 %0, metadata !26, metadata !DIExpression(DW_OP_constu, 4096, DW_OP_minus, DW_OP_stack_value)), !dbg !30
+ ; This debug value cannot safely be split into two 32-bit pieces.
+ ; CHECK-NOT: DW_AT_name(offset)
+ tail call void @llvm.dbg.value(metadata i32 undef, metadata !23, metadata !DIExpression()), !dbg !31
+ br i1 undef, label %for.end, label %for.body.lr.ph, !dbg !31
+
+for.body.lr.ph: ; preds = %entry
+ %0 = load i64, i64* %start, align 4, !dbg !33
+ br label %for.body, !dbg !31
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %offset.010.in = phi i64 [ %0, %for.body.lr.ph ], [ %offset.010, %for.body ]
+ %head_size.09 = phi i32 [ undef, %for.body.lr.ph ], [ %sub2, %for.body ]
+ %offset.010 = add i64 %offset.010.in, -4096
+ tail call void @llvm.dbg.value(metadata i32 %head_size.09, metadata !23, metadata !DIExpression()), !dbg !30
+ %call = tail call i32 bitcast (i32 (...)* @use to i32 (i64, %struct.vm_object*)*)(i64 %offset.010, %struct.vm_object* %object) #2, !dbg !34
+ %sub2 = add i32 %head_size.09, -4096, !dbg !37
+ tail call void @llvm.dbg.value(metadata i64 %offset.010, metadata !26, metadata !DIExpression(DW_OP_constu, 4096, DW_OP_minus, DW_OP_stack_value)), !dbg !29
+ tail call void @llvm.dbg.value(metadata i32 %sub2, metadata !23, metadata !DIExpression()), !dbg !30
+ %tobool = icmp eq i32 %sub2, 0, !dbg !31
+ br i1 %tobool, label %for.end, label %for.body, !dbg !31, !llvm.loop !38
+
+for.end: ; preds = %for.body, %entry
+ ret void, !dbg !40
+}
+
+declare i32 @use(...) local_unnamed_addr
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind ssp }
+attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { nobuiltin nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!5, !6, !7, !8, !9}
+!llvm.ident = !{!10}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0 (trunk 317434) (llvm/trunk 317437)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3)
+!1 = !DIFile(filename: "test.i", directory: "/Data/radar/31209283")
+!2 = !{}
+!3 = !{!4}
+!4 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!5 = !{i32 2, !"Dwarf Version", i32 2}
+!6 = !{i32 2, !"Debug Info Version", i32 3}
+!7 = !{i32 1, !"wchar_size", i32 4}
+!8 = !{i32 1, !"min_enum_size", i32 4}
+!9 = !{i32 7, !"PIC Level", i32 2}
+!10 = !{!"clang version 6.0.0 (trunk 317434) (llvm/trunk 317437)"}
+!11 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 6, type: !12, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !20)
+!12 = !DISubroutineType(types: !13)
+!13 = !{null, !14, !19}
+!14 = !DIDerivedType(tag: DW_TAG_typedef, name: "v_t", file: !1, line: 1, baseType: !15)
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 32)
+!16 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "v", file: !1, line: 2, size: 64, elements: !17)
+!17 = !{!18}
+!18 = !DIDerivedType(tag: DW_TAG_member, name: "p", scope: !16, file: !1, line: 3, baseType: !4, size: 64)
+!19 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 32)
+!20 = !{!21, !22, !23, !25, !26}
+!21 = !DILocalVariable(name: "object", arg: 1, scope: !11, file: !1, line: 6, type: !14)
+!22 = !DILocalVariable(name: "start", arg: 2, scope: !11, file: !1, line: 6, type: !19)
+!23 = !DILocalVariable(name: "head_size", scope: !11, file: !1, line: 7, type: !24)
+!24 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!25 = !DILocalVariable(name: "orig_start", scope: !11, file: !1, line: 8, type: !4)
+!26 = !DILocalVariable(name: "offset", scope: !11, file: !1, line: 9, type: !4)
+!27 = !DILocation(line: 6, column: 20, scope: !11)
+!28 = !DILocation(line: 6, column: 48, scope: !11)
+!29 = !DILocation(line: 7, column: 12, scope: !11)
+!30 = !DILocation(line: 10, column: 16, scope: !11)
+!31 = !DILocation(line: 11, column: 5, scope: !32)
+!32 = distinct !DILexicalBlock(scope: !11, file: !1, line: 11, column: 5)
+!33 = !DILocation(line: 8, column: 22, scope: !11)
+!34 = !DILocation(line: 13, column: 7, scope: !35)
+!35 = distinct !DILexicalBlock(scope: !36, file: !1, line: 12, column: 75)
+!36 = distinct !DILexicalBlock(scope: !32, file: !1, line: 11, column: 5)
+!37 = !DILocation(line: 12, column: 61, scope: !36)
+!38 = distinct !{!38, !31, !39}
+!39 = !DILocation(line: 14, column: 3, scope: !32)
+!40 = !DILocation(line: 15, column: 1, scope: !11)
diff --git a/test/DebugInfo/ARM/salvage-debug-info.ll b/test/DebugInfo/ARM/salvage-debug-info.ll
new file mode 100644
index 00000000000..5509b92a5c1
--- /dev/null
+++ b/test/DebugInfo/ARM/salvage-debug-info.ll
@@ -0,0 +1,118 @@
+; RUN: opt -codegenprepare -S %s -o - | FileCheck %s
+; typedef struct info {
+; unsigned long long size;
+; } info_t;
+; extern unsigned p;
+; extern unsigned n;
+; void f() {
+; unsigned int i;
+; if (p) {
+; info_t *info = (info_t *)p;
+; for (i = 0; i < n; i++)
+; use(info[i].size);
+; }
+; }
+source_filename = "debug.i"
+target datalayout = "e-m:o-p:32:32-i64:64-a:0:32-n32-S128"
+target triple = "thumbv7k-apple-ios10.0.0"
+
+%struct.info = type { i64 }
+
+@p = external local_unnamed_addr global i32, align 4
+@n = external local_unnamed_addr global i32, align 4
+
+; Function Attrs: nounwind ssp uwtable
+define void @f() local_unnamed_addr #0 !dbg !16 {
+entry:
+ %0 = load i32, i32* @p, align 4, !dbg !25
+ %tobool = icmp eq i32 %0, 0, !dbg !25
+ br i1 %tobool, label %if.end, label %if.then, !dbg !26
+
+if.then: ; preds = %entry
+ %1 = inttoptr i32 %0 to %struct.info*, !dbg !27
+ tail call void @llvm.dbg.value(metadata %struct.info* %1, metadata !22, metadata !DIExpression()), !dbg !28
+ ; CHECK: call void @llvm.dbg.value(metadata i32 %0, metadata !22, metadata !DIExpression())
+ tail call void @llvm.dbg.value(metadata i32 0, metadata !20, metadata !DIExpression()), !dbg !29
+ %2 = load i32, i32* @n, align 4, !dbg !30
+ %cmp5 = icmp eq i32 %2, 0, !dbg !33
+ br i1 %cmp5, label %if.end, label %for.body.preheader, !dbg !34
+
+for.body.preheader: ; preds = %if.then
+ ; CHECK: for.body.preheader:
+ ; CHECK: %2 = inttoptr i32 %0 to %struct.info*
+ br label %for.body, !dbg !35
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %lsr.iv = phi %struct.info* [ %1, %for.body.preheader ], [ %scevgep, %for.body ]
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %lsr.iv7 = bitcast %struct.info* %lsr.iv to i64*
+ tail call void @llvm.dbg.value(metadata i32 %i.06, metadata !20, metadata !DIExpression()), !dbg !29
+ %3 = load i64, i64* %lsr.iv7, align 8, !dbg !35
+ %call = tail call i32 bitcast (i32 (...)* @use to i32 (i64)*)(i64 %3) #3, !dbg !36
+ %inc = add nuw i32 %i.06, 1, !dbg !37
+ tail call void @llvm.dbg.value(metadata i32 %inc, metadata !20, metadata !DIExpression()), !dbg !29
+ %4 = load i32, i32* @n, align 4, !dbg !30
+ %scevgep = getelementptr %struct.info, %struct.info* %lsr.iv, i32 1, !dbg !33
+ %cmp = icmp ult i32 %inc, %4, !dbg !33
+ br i1 %cmp, label %for.body, label %if.end.loopexit, !dbg !34, !llvm.loop !38
+
+if.end.loopexit: ; preds = %for.body
+ br label %if.end, !dbg !40
+
+if.end: ; preds = %if.end.loopexit, %if.then, %entry
+ ret void, !dbg !40
+}
+declare i32 @use(...) local_unnamed_addr #1
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #2
+
+attributes #0 = { nounwind ssp uwtable }
+attributes #2 = { nounwind readnone speculatable }
+attributes #3 = { nobuiltin nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!10, !11, !12, !13, !14}
+!llvm.ident = !{!15}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0 (trunk 317231) (llvm/trunk 317262)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3)
+!1 = !DIFile(filename: "debug.i", directory: "/Data/radar/35321562")
+!2 = !{}
+!3 = !{!4}
+!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 32)
+!5 = !DIDerivedType(tag: DW_TAG_typedef, name: "info_t", file: !1, line: 3, baseType: !6)
+!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "info", file: !1, line: 1, size: 64, elements: !7)
+!7 = !{!8}
+!8 = !DIDerivedType(tag: DW_TAG_member, name: "size", scope: !6, file: !1, line: 2, baseType: !9, size: 64)
+!9 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!10 = !{i32 2, !"Dwarf Version", i32 4}
+!11 = !{i32 2, !"Debug Info Version", i32 3}
+!12 = !{i32 1, !"wchar_size", i32 4}
+!13 = !{i32 1, !"min_enum_size", i32 4}
+!14 = !{i32 7, !"PIC Level", i32 2}
+!15 = !{!"clang version 6.0.0 (trunk 317231) (llvm/trunk 317262)"}
+!16 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 6, type: !17, isLocal: false, isDefinition: true, scopeLine: 6, isOptimized: true, unit: !0, variables: !19)
+!17 = !DISubroutineType(types: !18)
+!18 = !{null}
+!19 = !{!20, !22}
+!20 = !DILocalVariable(name: "i", scope: !16, file: !1, line: 7, type: !21)
+!21 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!22 = !DILocalVariable(name: "info", scope: !23, file: !1, line: 9, type: !4)
+!23 = distinct !DILexicalBlock(scope: !24, file: !1, line: 8, column: 10)
+!24 = distinct !DILexicalBlock(scope: !16, file: !1, line: 8, column: 7)
+!25 = !DILocation(line: 8, column: 7, scope: !24)
+!26 = !DILocation(line: 8, column: 7, scope: !16)
+!27 = !DILocation(line: 9, column: 20, scope: !23)
+!28 = !DILocation(line: 9, column: 13, scope: !23)
+!29 = !DILocation(line: 7, column: 16, scope: !16)
+!30 = !DILocation(line: 10, column: 21, scope: !31)
+!31 = distinct !DILexicalBlock(scope: !32, file: !1, line: 10, column: 5)
+!32 = distinct !DILexicalBlock(scope: !23, file: !1, line: 10, column: 5)
+!33 = !DILocation(line: 10, column: 19, scope: !31)
+!34 = !DILocation(line: 10, column: 5, scope: !32)
+!35 = !DILocation(line: 11, column: 19, scope: !31)
+!36 = !DILocation(line: 11, column: 7, scope: !31)
+!37 = !DILocation(line: 10, column: 25, scope: !31)
+!38 = distinct !{!38, !34, !39}
+!39 = !DILocation(line: 11, column: 23, scope: !32)
+!40 = !DILocation(line: 13, column: 1, scope: !16)
diff --git a/test/DebugInfo/Generic/location-verifier.ll b/test/DebugInfo/Generic/location-verifier.ll
index b1e0805428c..3c6bb425a66 100644
--- a/test/DebugInfo/Generic/location-verifier.ll
+++ b/test/DebugInfo/Generic/location-verifier.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as -disable-output -verify-debug-info -o - < %s 2>&1 | FileCheck %s
+; RUN: llvm-as -disable-output -o - < %s 2>&1 | FileCheck %s
; ModuleID = 'test.c'
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"
diff --git a/test/DebugInfo/Generic/missing-abstract-variable.ll b/test/DebugInfo/Generic/missing-abstract-variable.ll
index 16dcdebd1f1..8d5aff4084d 100644
--- a/test/DebugInfo/Generic/missing-abstract-variable.ll
+++ b/test/DebugInfo/Generic/missing-abstract-variable.ll
@@ -2,11 +2,6 @@
; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -v -debug-info - | FileCheck %s
-; The formal parameter 'b' for Function 'x' when inlined within 'a' is lost on
-; mips and powerpc64 (and on x86_64 at at least -O2). Presumably this is a
-; SelectionDAG issue (do mips/powerpc64 use FastISel?).
-; XFAIL: mips, powerpc64, s390x, sparc
-
; Build from the following source with clang -O2.
; The important details are that 'x's abstract definition is first built during
diff --git a/test/DebugInfo/Inputs/dwarfdump-header.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-header.elf-x86-64
deleted file mode 100644
index 21c1eacd071..00000000000
--- a/test/DebugInfo/Inputs/dwarfdump-header.elf-x86-64
+++ /dev/null
Binary files differ
diff --git a/test/DebugInfo/X86/dwarfdump-header-64.s b/test/DebugInfo/X86/dwarfdump-header-64.s
new file mode 100644
index 00000000000..f0baa592d8d
--- /dev/null
+++ b/test/DebugInfo/X86/dwarfdump-header-64.s
@@ -0,0 +1,149 @@
+# Test object to verify dwarfdump handles a DWARF-64 v5 line header.
+# FIXME: Make the other headers DWARF-64 also.
+# FIXME: Add variants for earlier DWARF versions.
+
+# Lines beginning with @ELF@ should be preserved for ELF targets;
+# lines beginning with @MACHO@ should be preserved for Mach-O targets.
+
+# RUN: sed -e 's/@ELF@//;s/@MACHO@.*//' %s | \
+# RUN: llvm-mc -triple x86_64-unknown-linux -filetype=obj -o - | \
+# RUN: llvm-dwarfdump -v - | FileCheck %s
+
+# RUN: sed -e 's/@ELF@.*//;s/@MACHO@//' %s | \
+# RUN: llvm-mc -triple x86_64-apple-darwin -filetype=obj -o - | \
+# RUN: llvm-dwarfdump -v - | FileCheck %s
+
+
+@ELF@ .section .debug_str,"MS",@progbits,1
+@MACHO@ .section __DWARF,__debug_str,regular,debug
+str_producer:
+ .asciz "Handmade DWARF producer"
+str_CU_5:
+ .asciz "V5_compile_unit"
+str_LT_5a:
+ .asciz "Directory5a"
+str_LT_5b:
+ .asciz "Directory5b"
+
+@ELF@ .section .debug_abbrev,"",@progbits
+@MACHO@ .section __DWARF,__debug_abbrev,regular,debug
+abbrev:
+ .byte 0x01 # Abbrev code
+ .byte 0x11 # DW_TAG_compile_unit
+ .byte 0x00 # DW_CHILDREN_no
+ .byte 0x25 # DW_AT_producer
+ .byte 0x0e # DW_FORM_strp
+ .byte 0x03 # DW_AT_name
+ .byte 0x0e # DW_FORM_strp
+ .byte 0x10 # DW_AT_stmt_list
+ .byte 0x17 # DW_FORM_sec_offset
+ .byte 0x00 # EOM(1)
+ .byte 0x00 # EOM(2)
+
+@ELF@ .section .debug_info,"",@progbits
+@MACHO@ .section __DWARF,__debug_info,regular,debug
+
+# DWARF-32 v5 normal CU header.
+Lset0 = CU_5_end-CU_5_version # Length of Unit
+ .long Lset0
+CU_5_version:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+@ELF@ .long abbrev # Offset Into Abbrev. Section
+@MACHO@ .long 0
+# The compile-unit DIE, with DW_AT_producer, DW_AT_name, DW_AT_stmt_list.
+ .byte 1
+ .long str_producer
+ .long str_CU_5
+@ELF@ .long LH_5_start
+@MACHO@ .long 0
+ .byte 0 # NULL
+CU_5_end:
+
+# CHECK-LABEL: .debug_info contents:
+# CHECK: 0x00000000: Compile Unit: length = 0x00000016 version = 0x0005 unit_type = DW_UT_compile abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x0000001a)
+# CHECK: 0x0000000c: DW_TAG_compile_unit
+# CHECK-NEXT: DW_AT_producer {{.*}} "Handmade DWARF producer"
+# CHECK-NEXT: DW_AT_name {{.*}} "V5_compile_unit"
+# CHECK-NEXT: DW_AT_stmt_list {{.*}} (0x00000000)
+
+@ELF@ .section .debug_line,"",@progbits
+@MACHO@ .section __DWARF,__debug_line,regular,debug
+
+# DWARF-64 v5 line-table header.
+LH_5_start:
+ .long -1
+Lset1 = LH_5_end-LH_5_version # Length of Unit
+ .quad Lset1
+LH_5_version:
+ .short 5 # DWARF version number
+ .byte 8 # Address Size
+ .byte 0 # Segment Selector Size
+Lset2 = LH_5_header_end-LH_5_params # Length of Prologue
+ .quad Lset2
+LH_5_params:
+ .byte 1 # Minimum Instruction Length
+ .byte 1 # Maximum Operations per Instruction
+ .byte 1 # Default is_stmt
+ .byte -5 # Line Base
+ .byte 14 # Line Range
+ .byte 13 # Opcode Base
+ .byte 0 # Standard Opcode Lengths
+ .byte 1
+ .byte 1
+ .byte 1
+ .byte 1
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 1
+ .byte 0
+ .byte 0
+ .byte 1
+ # Directory table format
+ .byte 1 # One element per directory entry
+ .byte 1 # DW_LNCT_path
+ .byte 0x0e # DW_FORM_strp (-> .debug_str)
+ # Directory table entries
+ .byte 2 # Two directories
+ .quad str_LT_5a
+ .quad str_LT_5b
+ # File table format
+ .byte 4 # Four elements per file entry
+ .byte 1 # DW_LNCT_path
+ .byte 0x08 # DW_FORM_string
+ .byte 2 # DW_LNCT_directory_index
+ .byte 0x0b # DW_FORM_data1
+ .byte 3 # DW_LNCT_timestamp
+ .byte 0x0f # DW_FORM_udata
+ .byte 4 # DW_LNCT_size
+ .byte 0x0f # DW_FORM_udata
+ # File table entries
+ .byte 2 # Two files
+ .asciz "File5a"
+ .byte 1
+ .byte 0x51
+ .byte 0x52
+ .asciz "File5b"
+ .byte 2
+ .byte 0x53
+ .byte 0x54
+LH_5_header_end:
+ # Line number program, which is empty.
+LH_5_end:
+
+# CHECK-LABEL: .debug_line contents:
+# CHECK: Line table prologue:
+# CHECK: total_length: 0x00000050
+# CHECK: version: 5
+# CHECK: address_size: 8
+# CHECK: seg_select_size: 0
+# CHECK: prologue_length: 0x00000044
+# CHECK: max_ops_per_inst: 1
+# CHECK: include_directories[ 1] = 'Directory5a'
+# CHECK: include_directories[ 2] = 'Directory5b'
+# CHECK-NOT: include_directories
+# CHECK: file_names[ 1] 1 0x00000051 0x00000052 File5a{{$}}
+# CHECK: file_names[ 2] 2 0x00000053 0x00000054 File5b{{$}}
+# CHECK-NOT: file_names
diff --git a/test/DebugInfo/Inputs/dwarfdump-header.s b/test/DebugInfo/X86/dwarfdump-header.s
index c5cf4859776..d3d4e5a6827 100644
--- a/test/DebugInfo/Inputs/dwarfdump-header.s
+++ b/test/DebugInfo/X86/dwarfdump-header.s
@@ -2,9 +2,8 @@
# We have a representative set of units: v4 CU, v5 CU, v4 TU, v5 split TU.
# We have v4 and v5 line-table headers.
#
-# To generate the test object:
-# llvm-mc -triple x86_64-unknown-linux dwarfdump-header.s -filetype=obj \
-# -o dwarfdump-header.elf-x86-64
+# RUN: llvm-mc -triple x86_64-unknown-linux %s -filetype=obj -o - | \
+# RUN: llvm-dwarfdump -v - | FileCheck %s
.section .debug_str,"MS",@progbits,1
str_producer:
@@ -15,6 +14,10 @@ str_CU_5:
.asciz "V5_compile_unit"
str_TU_4:
.asciz "V4_type_unit"
+str_LT_5a:
+ .asciz "Directory5a"
+str_LT_5b:
+ .asciz "Directory5b"
.section .debug_str.dwo,"MS",@progbits,1
dwo_TU_5:
@@ -77,6 +80,7 @@ dwo_TU_5:
.byte 0x00 # EOM(3)
.section .debug_info,"",@progbits
+# CHECK-LABEL: .debug_info contents:
# DWARF v4 CU header. V4 CU headers all look the same so we do only one.
.long CU_4_end-CU_4_version # Length of Unit
@@ -92,6 +96,9 @@ CU_4_version:
.byte 0 # NULL
CU_4_end:
+# CHECK: 0x00000000: Compile Unit: length = 0x00000015 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000019)
+# CHECK: 0x0000000b: DW_TAG_compile_unit
+
# DWARF v5 normal CU header.
.long CU_5_end-CU_5_version # Length of Unit
CU_5_version:
@@ -107,7 +114,11 @@ CU_5_version:
.byte 0 # NULL
CU_5_end:
+# CHECK: 0x00000019: Compile Unit: length = 0x00000016 version = 0x0005 unit_type = DW_UT_compile abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000033)
+# CHECK: 0x00000025: DW_TAG_compile_unit
+
.section .debug_types,"",@progbits
+# CHECK-LABEL: .debug_types contents:
# DWARF v4 Type unit header. Normal/split are identical so we do only one.
TU_4_start:
@@ -129,8 +140,12 @@ TU_4_type:
.byte 0 # NULL
TU_4_end:
+# CHECK: 0x00000000: Type Unit: length = 0x0000001f version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 name = 'V4_type_unit' type_signature = 0x0011223344556677 type_offset = 0x001c (next unit at 0x00000023)
+# CHECK: 0x00000017: DW_TAG_type_unit
+
.section .debug_types.dwo,"",@progbits
# FIXME: DWARF v5 wants type units in .debug_info[.dwo] not .debug_types[.dwo].
+# CHECK: .debug_types.dwo contents:
# DWARF v5 split type unit header.
TU_split_5_start:
@@ -153,7 +168,12 @@ TU_split_5_type:
.byte 0 # NULL
TU_split_5_end:
+# CHECK: 0x00000000: Type Unit: length = 0x00000020 version = 0x0005 unit_type = DW_UT_split_type abbr_offset = 0x0000 addr_size = 0x08 name = 'V5_split_type_unit' type_signature = 0x8899aabbccddeeff type_offset = 0x001d (next unit at 0x00000024)
+# CHECK: 0x00000018: DW_TAG_type_unit
+
.section .debug_line,"",@progbits
+# CHECK-LABEL: .debug_line contents:
+
# DWARF v4 line-table header.
LH_4_start:
.long LH_4_end-LH_4_version # Length of Unit
@@ -197,6 +217,18 @@ LH_4_header_end:
# Line number program, which is empty.
LH_4_end:
+# CHECK: Line table prologue:
+# CHECK: version: 4
+# CHECK-NOT: address_size
+# CHECK-NOT: seg_select_size
+# CHECK: max_ops_per_inst: 1
+# CHECK: include_directories[ 1] = 'Directory4a'
+# CHECK: include_directories[ 2] = 'Directory4b'
+# CHECK-NOT: include_directories
+# CHECK: file_names[ 1] 1 0x00000041 0x00000042 File4a{{$}}
+# CHECK: file_names[ 2] 0 0x00000043 0x00000044 File4b{{$}}
+# CHECK-NOT: file_names
+
# DWARF v5 line-table header.
LH_5_start:
.long LH_5_end-LH_5_version # Length of Unit
@@ -227,11 +259,11 @@ LH_5_params:
# Directory table format
.byte 1 # One element per directory entry
.byte 1 # DW_LNCT_path
- .byte 0x08 # DW_FORM_string
+ .byte 0x0e # DW_FORM_strp (-> .debug_str)
# Directory table entries
.byte 2 # Two directories
- .asciz "Directory5a"
- .asciz "Directory5b"
+ .long str_LT_5a
+ .long str_LT_5b
# File table format
.byte 4 # Four elements per file entry
.byte 1 # DW_LNCT_path
@@ -255,3 +287,15 @@ LH_5_params:
LH_5_header_end:
# Line number program, which is empty.
LH_5_end:
+
+# CHECK: Line table prologue:
+# CHECK: version: 5
+# CHECK: address_size: 8
+# CHECK: seg_select_size: 0
+# CHECK: max_ops_per_inst: 1
+# CHECK: include_directories[ 1] = 'Directory5a'
+# CHECK: include_directories[ 2] = 'Directory5b'
+# CHECK-NOT: include_directories
+# CHECK: file_names[ 1] 1 0x00000051 0x00000052 File5a{{$}}
+# CHECK: file_names[ 2] 2 0x00000053 0x00000054 File5b{{$}}
+# CHECK-NOT: file_names
diff --git a/test/DebugInfo/X86/live-debug-variables.ll b/test/DebugInfo/X86/live-debug-variables.ll
index fbfd1d91a81..90669f5412c 100644
--- a/test/DebugInfo/X86/live-debug-variables.ll
+++ b/test/DebugInfo/X86/live-debug-variables.ll
@@ -24,8 +24,9 @@
; CHECK: .debug_loc contents:
; CHECK-NEXT: 0x00000000:
-; CHECK-NEXT: 0x000000000000001f - 0x000000000000003c: DW_OP_reg3 RBX
-; We should only have one entry
+; We currently emit an entry for the function prologue, too, which could be optimized away.
+; CHECK: 0x000000000000001f - 0x000000000000003c: DW_OP_reg3 RBX
+; We should only have one entry inside the function.
; CHECK-NOT: :
declare i32 @foobar(i32, i32, i32, i32, i32)
diff --git a/test/DebugInfo/dwarfdump-header.test b/test/DebugInfo/dwarfdump-header.test
deleted file mode 100644
index 375f7043c9f..00000000000
--- a/test/DebugInfo/dwarfdump-header.test
+++ /dev/null
@@ -1,60 +0,0 @@
-RUN: llvm-dwarfdump -v %p/Inputs/dwarfdump-header.elf-x86-64 | FileCheck %s
-RUN: llvm-dwarfdump -v --verify %p/Inputs/dwarfdump-header.elf-x86-64
-
-The input file is hand-coded assembler to generate all the units,
-so we're willing to make exact checks for offsets and such.
-
-CHECK-LABEL: .debug_info contents:
-
-The v4 CU header.
-
-CHECK: 0x00000000: Compile Unit: length = 0x00000015 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000019)
-CHECK: 0x0000000b: DW_TAG_compile_unit
-
-The v5 normal CU header.
-
-CHECK: 0x00000019: Compile Unit: length = 0x00000016 version = 0x0005 unit_type = DW_UT_compile abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000033)
-CHECK: 0x00000025: DW_TAG_compile_unit
-
-CHECK-LABEL: .debug_types contents:
-
-The v4 type unit header.
-
-CHECK: 0x00000000: Type Unit: length = 0x0000001f version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 name = 'V4_type_unit' type_signature = 0x0011223344556677 type_offset = 0x001c (next unit at 0x00000023)
-CHECK: 0x00000017: DW_TAG_type_unit
-
-FIXME: DWARF v5 wants type units in .debug_info[.dwo] not .debug_types[.dwo].
-CHECK: .debug_types.dwo contents:
-
-CHECK: 0x00000000: Type Unit: length = 0x00000020 version = 0x0005 unit_type = DW_UT_split_type abbr_offset = 0x0000 addr_size = 0x08 name = 'V5_split_type_unit' type_signature = 0x8899aabbccddeeff type_offset = 0x001d (next unit at 0x00000024)
-CHECK: 0x00000018: DW_TAG_type_unit
-
-CHECK-LABEL: .debug_line contents:
-
-The v4 line table header.
-
-CHECK: Line table prologue:
-CHECK: version: 4
-CHECK-NOT: address_size
-CHECK-NOT: seg_select_size
-CHECK: max_ops_per_inst: 1
-CHECK: include_directories[ 1] = 'Directory4a'
-CHECK: include_directories[ 2] = 'Directory4b'
-CHECK-NOT: include_directories
-CHECK: file_names[ 1] 1 0x00000041 0x00000042 File4a{{$}}
-CHECK: file_names[ 2] 0 0x00000043 0x00000044 File4b{{$}}
-CHECK-NOT: file_names
-
-The v5 line table header.
-
-CHECK: Line table prologue:
-CHECK: version: 5
-CHECK: address_size: 8
-CHECK: seg_select_size: 0
-CHECK: max_ops_per_inst: 1
-CHECK: include_directories[ 1] = 'Directory5a'
-CHECK: include_directories[ 2] = 'Directory5b'
-CHECK-NOT: include_directories
-CHECK: file_names[ 1] 1 0x00000051 0x00000052 File5a{{$}}
-CHECK: file_names[ 2] 2 0x00000053 0x00000054 File5b{{$}}
-CHECK-NOT: file_names
diff --git a/test/FileCheck/defines.txt b/test/FileCheck/defines.txt
new file mode 100644
index 00000000000..d2219b7ca25
--- /dev/null
+++ b/test/FileCheck/defines.txt
@@ -0,0 +1,9 @@
+; RUN: FileCheck -DVALUE=10 -input-file %s %s
+; RUN: not FileCheck -DVALUE=20 -input-file %s %s 2>&1 | FileCheck %s -check-prefix ERRMSG
+
+Value = 10
+; CHECK: Value = [[VALUE]]
+
+; ERRMSG: defines.txt:5:10: error: expected string not found in input
+; ERRMSG: defines.txt:1:1: note: with variable "VALUE" equal to "20"
+; ERRMSG: defines.txt:4:1: note: possible intended match here
diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll b/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll
index c3c2435fc87..1fc20febc94 100644
--- a/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll
+++ b/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll
@@ -39,8 +39,7 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: [[B]]:
; CHECK-NEXT: popfq
-; CHECK: rep
-; CHECK-NEXT: movsb (%rsi), %es:(%rdi)
+; CHECK: rep movsb (%rsi), %es:(%rdi)
; Function Attrs: nounwind sanitize_address uwtable
define void @rep_movs_1b(i8* %dst, i8* %src, i64 %n) #0 {
@@ -73,8 +72,7 @@ entry:
; CHECK: [[Q]]:
; CHECK-NEXT: popfq
-; CHECK: rep
-; CHECK-NEXT: movsq (%rsi), %es:(%rdi)
+; CHECK: rep movsq (%rsi), %es:(%rdi)
; Function Attrs: nounwind sanitize_address uwtable
define void @rep_movs_8b(i64* %dst, i64* %src, i64 %n) #0 {
diff --git a/test/LTO/Resolution/X86/comdat-mixed-lto.ll b/test/LTO/Resolution/X86/comdat-mixed-lto.ll
index f6ee22e4161..d6022c64351 100644
--- a/test/LTO/Resolution/X86/comdat-mixed-lto.ll
+++ b/test/LTO/Resolution/X86/comdat-mixed-lto.ll
@@ -17,7 +17,7 @@
; would clash with the copy from this module.
; RUN: llvm-dis %t3.0.0.preopt.bc -o - | FileCheck %s
; CHECK: define internal void @__cxx_global_var_init() section ".text.startup" {
-; CHECK: define available_externally void @testglobfunc() section ".text.startup" {
+; CHECK: define available_externally dso_local void @testglobfunc() section ".text.startup" {
; ModuleID = 'comdat-mixed-lto.o'
source_filename = "comdat-mixed-lto.cpp"
diff --git a/test/LTO/Resolution/X86/comdat.ll b/test/LTO/Resolution/X86/comdat.ll
index 60d082b3e0f..94f28384231 100644
--- a/test/LTO/Resolution/X86/comdat.ll
+++ b/test/LTO/Resolution/X86/comdat.ll
@@ -70,14 +70,14 @@ bb11:
; CHECK-DAG: @a23 = alias i32 (i8*), i32 (i8*)* @f1.2{{$}}
; CHECK-DAG: @a24 = alias i16, bitcast (i32 (i8*)* @f1.2 to i16*)
-; CHECK: define weak_odr i32 @f1(i8*) comdat($c1) {
+; CHECK: define weak_odr dso_local i32 @f1(i8*) comdat($c1) {
; CHECK-NEXT: bb10:
; CHECK-NEXT: br label %bb11{{$}}
; CHECK: bb11:
; CHECK-NEXT: ret i32 42
; CHECK-NEXT: }
-; CHECK: define internal i32 @f1.2(i8* %this) comdat($c2) {
+; CHECK: define internal dso_local i32 @f1.2(i8* %this) comdat($c2) {
; CHECK-NEXT: bb20:
; CHECK-NEXT: store i8* %this, i8** null
; CHECK-NEXT: br label %bb21
diff --git a/test/LTO/Resolution/X86/commons.ll b/test/LTO/Resolution/X86/commons.ll
index 28bf1ada4a8..8adfb87d6ed 100644
--- a/test/LTO/Resolution/X86/commons.ll
+++ b/test/LTO/Resolution/X86/commons.ll
@@ -4,7 +4,7 @@
; RUN: llvm-dis -o - %t.out.0.0.preopt.bc | FileCheck %s
; A strong definition should override the common
-; CHECK: @x = global i32 42, align 4
+; CHECK: @x = dso_local global i32 42, align 4
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/MC/AArch64/SVE/assembler_tests/add.s b/test/MC/AArch64/SVE/assembler_tests/add.s
new file mode 100644
index 00000000000..7906dbbaf88
--- /dev/null
+++ b/test/MC/AArch64/SVE/assembler_tests/add.s
@@ -0,0 +1,66 @@
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -show-encoding -mattr=+sve < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64-none-linux-gnu -show-encoding -mattr=-sve 2>&1 < %s | FileCheck --check-prefix=CHECK-ERROR %s
+add z31.s, z31.s, z31.s // 00000100-10111111-00000011-11111111
+// CHECK: add z31.s, z31.s, z31.s // encoding: [0xff,0x03,0xbf,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-10111111-00000011-11111111
+add z23.d, z13.d, z8.d // 00000100-11101000-00000001-10110111
+// CHECK: add z23.d, z13.d, z8.d // encoding: [0xb7,0x01,0xe8,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-11101000-00000001-10110111
+add z0.s, z0.s, z0.s // 00000100-10100000-00000000-00000000
+// CHECK: add z0.s, z0.s, z0.s // encoding: [0x00,0x00,0xa0,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-10100000-00000000-00000000
+add z31.d, z31.d, z31.d // 00000100-11111111-00000011-11111111
+// CHECK: add z31.d, z31.d, z31.d // encoding: [0xff,0x03,0xff,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-11111111-00000011-11111111
+add z21.b, z10.b, z21.b // 00000100-00110101-00000001-01010101
+// CHECK: add z21.b, z10.b, z21.b // encoding: [0x55,0x01,0x35,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-00110101-00000001-01010101
+add z31.b, z31.b, z31.b // 00000100-00111111-00000011-11111111
+// CHECK: add z31.b, z31.b, z31.b // encoding: [0xff,0x03,0x3f,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-00111111-00000011-11111111
+add z0.h, z0.h, z0.h // 00000100-01100000-00000000-00000000
+// CHECK: add z0.h, z0.h, z0.h // encoding: [0x00,0x00,0x60,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-01100000-00000000-00000000
+add z23.b, z13.b, z8.b // 00000100-00101000-00000001-10110111
+// CHECK: add z23.b, z13.b, z8.b // encoding: [0xb7,0x01,0x28,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-00101000-00000001-10110111
+add z0.d, z0.d, z0.d // 00000100-11100000-00000000-00000000
+// CHECK: add z0.d, z0.d, z0.d // encoding: [0x00,0x00,0xe0,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-11100000-00000000-00000000
+add z31.h, z31.h, z31.h // 00000100-01111111-00000011-11111111
+// CHECK: add z31.h, z31.h, z31.h // encoding: [0xff,0x03,0x7f,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-01111111-00000011-11111111
+add z0.b, z0.b, z0.b // 00000100-00100000-00000000-00000000
+// CHECK: add z0.b, z0.b, z0.b // encoding: [0x00,0x00,0x20,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-00100000-00000000-00000000
+add z21.d, z10.d, z21.d // 00000100-11110101-00000001-01010101
+// CHECK: add z21.d, z10.d, z21.d // encoding: [0x55,0x01,0xf5,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-11110101-00000001-01010101
+add z21.h, z10.h, z21.h // 00000100-01110101-00000001-01010101
+// CHECK: add z21.h, z10.h, z21.h // encoding: [0x55,0x01,0x75,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-01110101-00000001-01010101
+add z21.s, z10.s, z21.s // 00000100-10110101-00000001-01010101
+// CHECK: add z21.s, z10.s, z21.s // encoding: [0x55,0x01,0xb5,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-10110101-00000001-01010101
+add z23.h, z13.h, z8.h // 00000100-01101000-00000001-10110111
+// CHECK: add z23.h, z13.h, z8.h // encoding: [0xb7,0x01,0x68,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-01101000-00000001-10110111
+add z23.s, z13.s, z8.s // 00000100-10101000-00000001-10110111
+// CHECK: add z23.s, z13.s, z8.s // encoding: [0xb7,0x01,0xa8,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-10101000-00000001-10110111
diff --git a/test/MC/AArch64/SVE/assembler_tests/sub.s b/test/MC/AArch64/SVE/assembler_tests/sub.s
new file mode 100644
index 00000000000..ee283afdb7f
--- /dev/null
+++ b/test/MC/AArch64/SVE/assembler_tests/sub.s
@@ -0,0 +1,66 @@
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -show-encoding -mattr=+sve < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64-none-linux-gnu -show-encoding -mattr=-sve 2>&1 < %s | FileCheck --check-prefix=CHECK-ERROR %s
+sub z0.h, z0.h, z0.h // 00000100-01100000-00000100-00000000
+// CHECK: sub z0.h, z0.h, z0.h // encoding: [0x00,0x04,0x60,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-01100000-00000100-00000000
+sub z21.b, z10.b, z21.b // 00000100-00110101-00000101-01010101
+// CHECK: sub z21.b, z10.b, z21.b // encoding: [0x55,0x05,0x35,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-00110101-00000101-01010101
+sub z31.h, z31.h, z31.h // 00000100-01111111-00000111-11111111
+// CHECK: sub z31.h, z31.h, z31.h // encoding: [0xff,0x07,0x7f,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-01111111-00000111-11111111
+sub z21.h, z10.h, z21.h // 00000100-01110101-00000101-01010101
+// CHECK: sub z21.h, z10.h, z21.h // encoding: [0x55,0x05,0x75,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-01110101-00000101-01010101
+sub z31.b, z31.b, z31.b // 00000100-00111111-00000111-11111111
+// CHECK: sub z31.b, z31.b, z31.b // encoding: [0xff,0x07,0x3f,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-00111111-00000111-11111111
+sub z0.s, z0.s, z0.s // 00000100-10100000-00000100-00000000
+// CHECK: sub z0.s, z0.s, z0.s // encoding: [0x00,0x04,0xa0,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-10100000-00000100-00000000
+sub z23.b, z13.b, z8.b // 00000100-00101000-00000101-10110111
+// CHECK: sub z23.b, z13.b, z8.b // encoding: [0xb7,0x05,0x28,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-00101000-00000101-10110111
+sub z21.d, z10.d, z21.d // 00000100-11110101-00000101-01010101
+// CHECK: sub z21.d, z10.d, z21.d // encoding: [0x55,0x05,0xf5,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-11110101-00000101-01010101
+sub z21.s, z10.s, z21.s // 00000100-10110101-00000101-01010101
+// CHECK: sub z21.s, z10.s, z21.s // encoding: [0x55,0x05,0xb5,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-10110101-00000101-01010101
+sub z0.b, z0.b, z0.b // 00000100-00100000-00000100-00000000
+// CHECK: sub z0.b, z0.b, z0.b // encoding: [0x00,0x04,0x20,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-00100000-00000100-00000000
+sub z23.d, z13.d, z8.d // 00000100-11101000-00000101-10110111
+// CHECK: sub z23.d, z13.d, z8.d // encoding: [0xb7,0x05,0xe8,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-11101000-00000101-10110111
+sub z23.s, z13.s, z8.s // 00000100-10101000-00000101-10110111
+// CHECK: sub z23.s, z13.s, z8.s // encoding: [0xb7,0x05,0xa8,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-10101000-00000101-10110111
+sub z31.d, z31.d, z31.d // 00000100-11111111-00000111-11111111
+// CHECK: sub z31.d, z31.d, z31.d // encoding: [0xff,0x07,0xff,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-11111111-00000111-11111111
+sub z23.h, z13.h, z8.h // 00000100-01101000-00000101-10110111
+// CHECK: sub z23.h, z13.h, z8.h // encoding: [0xb7,0x05,0x68,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-01101000-00000101-10110111
+sub z0.d, z0.d, z0.d // 00000100-11100000-00000100-00000000
+// CHECK: sub z0.d, z0.d, z0.d // encoding: [0x00,0x04,0xe0,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-11100000-00000100-00000000
+sub z31.s, z31.s, z31.s // 00000100-10111111-00000111-11111111
+// CHECK: sub z31.s, z31.s, z31.s // encoding: [0xff,0x07,0xbf,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-10111111-00000111-11111111
diff --git a/test/MC/AArch64/SVE/disassembler_tests/add.s b/test/MC/AArch64/SVE/disassembler_tests/add.s
new file mode 100644
index 00000000000..22a61fb4a84
--- /dev/null
+++ b/test/MC/AArch64/SVE/disassembler_tests/add.s
@@ -0,0 +1,50 @@
+# RUN: llvm-mc -triple=aarch64-none-linux-gnu -show-encoding -disassemble -mattr=+sve < %s | FileCheck %s
+# RUN: llvm-mc -triple=aarch64-none-linux-gnu -show-encoding -disassemble -mattr=-sve 2>&1 < %s | FileCheck --check-prefix=CHECK-ERROR %s
+0xff,0x03,0xbf,0x04
+# CHECK: add z31.s, z31.s, z31.s // encoding: [0xff,0x03,0xbf,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xb7,0x01,0xe8,0x04
+# CHECK: add z23.d, z13.d, z8.d // encoding: [0xb7,0x01,0xe8,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x00,0x00,0xa0,0x04
+# CHECK: add z0.s, z0.s, z0.s // encoding: [0x00,0x00,0xa0,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xff,0x03,0xff,0x04
+# CHECK: add z31.d, z31.d, z31.d // encoding: [0xff,0x03,0xff,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x55,0x01,0x35,0x04
+# CHECK: add z21.b, z10.b, z21.b // encoding: [0x55,0x01,0x35,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xff,0x03,0x3f,0x04
+# CHECK: add z31.b, z31.b, z31.b // encoding: [0xff,0x03,0x3f,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x00,0x00,0x60,0x04
+# CHECK: add z0.h, z0.h, z0.h // encoding: [0x00,0x00,0x60,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xb7,0x01,0x28,0x04
+# CHECK: add z23.b, z13.b, z8.b // encoding: [0xb7,0x01,0x28,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x00,0x00,0xe0,0x04
+# CHECK: add z0.d, z0.d, z0.d // encoding: [0x00,0x00,0xe0,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xff,0x03,0x7f,0x04
+# CHECK: add z31.h, z31.h, z31.h // encoding: [0xff,0x03,0x7f,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x00,0x00,0x20,0x04
+# CHECK: add z0.b, z0.b, z0.b // encoding: [0x00,0x00,0x20,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x55,0x01,0xf5,0x04
+# CHECK: add z21.d, z10.d, z21.d // encoding: [0x55,0x01,0xf5,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x55,0x01,0x75,0x04
+# CHECK: add z21.h, z10.h, z21.h // encoding: [0x55,0x01,0x75,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x55,0x01,0xb5,0x04
+# CHECK: add z21.s, z10.s, z21.s // encoding: [0x55,0x01,0xb5,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xb7,0x01,0x68,0x04
+# CHECK: add z23.h, z13.h, z8.h // encoding: [0xb7,0x01,0x68,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xb7,0x01,0xa8,0x04
+# CHECK: add z23.s, z13.s, z8.s // encoding: [0xb7,0x01,0xa8,0x04]
+# CHECK-ERROR: invalid instruction encoding
diff --git a/test/MC/AArch64/SVE/disassembler_tests/sub.s b/test/MC/AArch64/SVE/disassembler_tests/sub.s
new file mode 100644
index 00000000000..e7acde952a7
--- /dev/null
+++ b/test/MC/AArch64/SVE/disassembler_tests/sub.s
@@ -0,0 +1,50 @@
+# RUN: llvm-mc -triple=aarch64-none-linux-gnu -show-encoding -disassemble -mattr=+sve < %s | FileCheck %s
+# RUN: llvm-mc -triple=aarch64-none-linux-gnu -show-encoding -disassemble -mattr=-sve 2>&1 < %s | FileCheck --check-prefix=CHECK-ERROR %s
+0x00,0x04,0x60,0x04
+# CHECK: sub z0.h, z0.h, z0.h // encoding: [0x00,0x04,0x60,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x55,0x05,0x35,0x04
+# CHECK: sub z21.b, z10.b, z21.b // encoding: [0x55,0x05,0x35,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xff,0x07,0x7f,0x04
+# CHECK: sub z31.h, z31.h, z31.h // encoding: [0xff,0x07,0x7f,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x55,0x05,0x75,0x04
+# CHECK: sub z21.h, z10.h, z21.h // encoding: [0x55,0x05,0x75,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xff,0x07,0x3f,0x04
+# CHECK: sub z31.b, z31.b, z31.b // encoding: [0xff,0x07,0x3f,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x00,0x04,0xa0,0x04
+# CHECK: sub z0.s, z0.s, z0.s // encoding: [0x00,0x04,0xa0,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xb7,0x05,0x28,0x04
+# CHECK: sub z23.b, z13.b, z8.b // encoding: [0xb7,0x05,0x28,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x55,0x05,0xf5,0x04
+# CHECK: sub z21.d, z10.d, z21.d // encoding: [0x55,0x05,0xf5,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x55,0x05,0xb5,0x04
+# CHECK: sub z21.s, z10.s, z21.s // encoding: [0x55,0x05,0xb5,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x00,0x04,0x20,0x04
+# CHECK: sub z0.b, z0.b, z0.b // encoding: [0x00,0x04,0x20,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xb7,0x05,0xe8,0x04
+# CHECK: sub z23.d, z13.d, z8.d // encoding: [0xb7,0x05,0xe8,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xb7,0x05,0xa8,0x04
+# CHECK: sub z23.s, z13.s, z8.s // encoding: [0xb7,0x05,0xa8,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xff,0x07,0xff,0x04
+# CHECK: sub z31.d, z31.d, z31.d // encoding: [0xff,0x07,0xff,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xb7,0x05,0x68,0x04
+# CHECK: sub z23.h, z13.h, z8.h // encoding: [0xb7,0x05,0x68,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x00,0x04,0xe0,0x04
+# CHECK: sub z0.d, z0.d, z0.d // encoding: [0x00,0x04,0xe0,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xff,0x07,0xbf,0x04
+# CHECK: sub z31.s, z31.s, z31.s // encoding: [0xff,0x07,0xbf,0x04]
+# CHECK-ERROR: invalid instruction encoding
diff --git a/test/MC/Disassembler/Mips/micromips32r3/valid-el.txt b/test/MC/Disassembler/Mips/micromips32r3/valid-el.txt
index dc76f48a95a..0cd74f5ba71 100644
--- a/test/MC/Disassembler/Mips/micromips32r3/valid-el.txt
+++ b/test/MC/Disassembler/Mips/micromips32r3/valid-el.txt
@@ -27,6 +27,7 @@
0x09 0x46 # CHECK: mfhi $9
0x49 0x46 # CHECK: mflo $9
0x21 0x0f # CHECK: move $25, $1
+0x9a 0x85 # CHECK: movep $4, $21, $18, $17
0xa9 0x45 # CHECK: jrc $9
0xc9 0x45 # CHECK: jalr $9
0xe9 0x45 # CHECK: jalrs16 $9
diff --git a/test/MC/Disassembler/Mips/micromips32r3/valid.txt b/test/MC/Disassembler/Mips/micromips32r3/valid.txt
index 38d6897e1c4..dbab070b874 100644
--- a/test/MC/Disassembler/Mips/micromips32r3/valid.txt
+++ b/test/MC/Disassembler/Mips/micromips32r3/valid.txt
@@ -27,6 +27,7 @@
0x46 0x09 # CHECK: mfhi $9
0x46 0x49 # CHECK: mflo $9
0x0f 0x21 # CHECK: move $25, $1
+0x85 0x9a # CHECK: movep $4, $21, $18, $17
0x45 0xa9 # CHECK: jrc $9
0x45 0xc9 # CHECK: jalr $9
0x45 0xe9 # CHECK: jalrs16 $9
diff --git a/test/MC/Disassembler/Mips/micromips32r6/valid.txt b/test/MC/Disassembler/Mips/micromips32r6/valid.txt
index f32f2532c24..462866d3347 100644
--- a/test/MC/Disassembler/Mips/micromips32r6/valid.txt
+++ b/test/MC/Disassembler/Mips/micromips32r6/valid.txt
@@ -21,7 +21,7 @@
0x29 0x82 # CHECK: lhu16 $3, 4($16)
0x09 0x94 # CHECK: lbu16 $3, 4($17)
0x09 0x9f # CHECK: lbu16 $3, -1($17)
-0x84 0x34 # CHECK: movep $5, $6, $2, $3
+0x44 0x36 # CHECK: movep $5, $6, $2, $3
0x04 0xcc # CHECK: addu16 $6, $17, $4
0x44 0x21 # CHECK: and16 $16, $2
0x2e 0x56 # CHECK: andi16 $4, $5, 8
diff --git a/test/MC/Disassembler/Mips/micromips64r6/valid.txt b/test/MC/Disassembler/Mips/micromips64r6/valid.txt
index 9186e66d4d0..07cea0d77c5 100644
--- a/test/MC/Disassembler/Mips/micromips64r6/valid.txt
+++ b/test/MC/Disassembler/Mips/micromips64r6/valid.txt
@@ -23,7 +23,7 @@
0x45 0x2b # CHECK: jalr $9
0x45 0x23 # CHECK: jrc16 $9
0x44 0xb3 # CHECK: jrcaddiusp 20
-0x84 0x34 # CHECK: movep $5, $6, $2, $3
+0x44 0x36 # CHECK: movep $5, $6, $2, $3
0x45 0xf9 # CHECK: or16 $3, $7
0x60 0x44 0x30 0x08 # CHECK: ll $2, 8($4)
0x20 0x44 0x50 0x08 # CHECK: lwm32 $16, $17, 8($4)
diff --git a/test/MC/Disassembler/X86/prefixes-i386.txt b/test/MC/Disassembler/X86/prefixes-i386.txt
index ff2fb223873..3152cc31aad 100644
--- a/test/MC/Disassembler/X86/prefixes-i386.txt
+++ b/test/MC/Disassembler/X86/prefixes-i386.txt
@@ -3,85 +3,59 @@
# CHECK: movl %fs:24, %eax
0x64 0xa1 0x18 0x00 0x00 0x00 # mov eax, dword ptr fs:[18h]
-# CHECK: rep
-# CHECK-NEXT: insb %dx, %es:(%edi)
+# CHECK: rep insb %dx, %es:(%edi)
0xf3 0x6c #rep ins
-# CHECK: rep
-# CHECK-NEXT: insl %dx, %es:(%edi)
+# CHECK: rep insl %dx, %es:(%edi)
0xf3 0x6d #rep ins
-# CHECK: rep
-# CHECK-NEXT: movsb (%esi), %es:(%edi)
+# CHECK: rep movsb (%esi), %es:(%edi)
0xf3 0xa4 #rep movs
-# CHECK: rep
-# CHECK-NEXT: movsl (%esi), %es:(%edi)
+# CHECK: rep movsl (%esi), %es:(%edi)
0xf3 0xa5 #rep movs
-# CHECK: rep
-# CHECK-NEXT: outsb (%esi), %dx
+# CHECK: rep outsb (%esi), %dx
0xf3 0x6e #rep outs
-# CHECK: rep
-# CHECK-NEXT: outsl (%esi), %dx
+# CHECK: rep outsl (%esi), %dx
0xf3 0x6f #rep outs
-# CHECK: rep
-# CHECK-NEXT: lodsb (%esi), %al
+# CHECK: rep lodsb (%esi), %al
0xf3 0xac #rep lods
-# CHECK: rep
-# CHECK-NEXT: lodsl (%esi), %eax
+# CHECK: rep lodsl (%esi), %eax
0xf3 0xad #rep lods
-# CHECK: rep
-# CHECK-NEXT: stosb %al, %es:(%edi)
+# CHECK: rep stosb %al, %es:(%edi)
0xf3 0xaa #rep stos
-# CHECK: rep
-# CHECK-NEXT: stosl %eax, %es:(%edi)
+# CHECK: rep stosl %eax, %es:(%edi)
0xf3 0xab #rep stos
-# CHECK: rep
-# CHECK-NEXT: cmpsb %es:(%edi), (%esi)
+# CHECK: rep cmpsb %es:(%edi), (%esi)
0xf3 0xa6 #rep cmps
-# CHECK: rep
-# CHECK-NEXT: cmpsl %es:(%edi), (%esi)
+# CHECK: rep cmpsl %es:(%edi), (%esi)
0xf3 0xa7 #repe cmps
-# CHECK: rep
-# CHECK-NEXT: scasb %es:(%edi), %al
+# CHECK: rep scasb %es:(%edi), %al
0xf3 0xae #repe scas
-# CHECK: rep
-# CHECK-NEXT: scasl %es:(%edi), %eax
+# CHECK: rep scasl %es:(%edi), %eax
0xf3 0xaf #repe scas
-# CHECK: repne
-# CHECK-NEXT: cmpsb %es:(%edi), (%esi)
+# CHECK: repne cmpsb %es:(%edi), (%esi)
0xf2 0xa6 #repne cmps
-# CHECK: repne
-# CHECK-NEXT: cmpsl %es:(%edi), (%esi)
+# CHECK: repne cmpsl %es:(%edi), (%esi)
0xf2 0xa7 #repne cmps
-# CHECK: repne
-# CHECK-NEXT: scasb %es:(%edi), %al
+# CHECK: repne scasb %es:(%edi), %al
0xf2 0xae #repne scas
-# CHECK: repne
-# CHECK-NEXT: scasl %es:(%edi), %eax
+# CHECK: repne scasl %es:(%edi), %eax
0xf2 0xaf #repne scas
-# CHECK: repne
-# CHECK-NEXT: scasw %es:(%edi), %ax
+# CHECK: repne scasw %es:(%edi), %ax
0xf2 0x66 0xaf
-# CHECK: repne
-# CHECK-NEXT: scasw %es:(%edi), %ax
+# CHECK: repne scasw %es:(%edi), %ax
0x66 0xf2 0xaf
-# CHECK: rep
-# CHECK-NEXT: scasw %es:(%edi), %ax
+# CHECK: rep scasw %es:(%edi), %ax
0xf3 0x66 0xaf
-# CHECK: rep
-# CHECK-NEXT: scasw %es:(%edi), %ax
+# CHECK: rep scasw %es:(%edi), %ax
0x66 0xf3 0xaf
-# CHECK: repne
-# CHECK: insw %dx, %es:(%edi)
+# CHECK: repne insw %dx, %es:(%edi)
0xf2 0x66 0x6d
-# CHECK: repne
-# CHECK: insw %dx, %es:(%edi)
+# CHECK: repne insw %dx, %es:(%edi)
0x66 0xf2 0x6d
-# CHECK: rep
-# CHECK: insw %dx, %es:(%edi)
+# CHECK: rep insw %dx, %es:(%edi)
0xf3 0x66 0x6d
-# CHECK: rep
-# CHECK: insw %dx, %es:(%edi)
+# CHECK: rep insw %dx, %es:(%edi)
0x66 0xf3 0x6d
diff --git a/test/MC/Disassembler/X86/prefixes-x86_64.txt b/test/MC/Disassembler/X86/prefixes-x86_64.txt
index 7a9208f7b63..c9bf512aa75 100644
--- a/test/MC/Disassembler/X86/prefixes-x86_64.txt
+++ b/test/MC/Disassembler/X86/prefixes-x86_64.txt
@@ -9,30 +9,22 @@
# CHECK: mulsd %xmm7, %xmm7
0xf2 0x66 0x0f 0x59 0xff
-# CHECK: repne
-# CHECK-NEXT: scasw %es:(%rdi), %ax
+# CHECK: repne scasw %es:(%rdi), %ax
0xf2 0x66 0xaf
-# CHECK: rep
-# CHECK-NEXT: scasw %es:(%rdi), %ax
+# CHECK: repne scasw %es:(%rdi), %ax
0x66 0xf2 0xaf
-# CHECK: rep
-# CHECK-NEXT: scasw %es:(%rdi), %ax
+# CHECK: rep scasw %es:(%rdi), %ax
0xf3 0x66 0xaf
-# CHECK: rep
-# CHECK-NEXT: scasw %es:(%rdi), %ax
+# CHECK: rep scasw %es:(%rdi), %ax
0x66 0xf3 0xaf
-# CHECK: repne
-# CHECK: insw %dx, %es:(%rdi)
+# CHECK: repne insw %dx, %es:(%rdi)
0xf2 0x66 0x6d
-# CHECK: repne
-# CHECK: insw %dx, %es:(%rdi)
+# CHECK: repne insw %dx, %es:(%rdi)
0x66 0xf2 0x6d
-# CHECK: rep
-# CHECK: insw %dx, %es:(%rdi)
+# CHECK: rep insw %dx, %es:(%rdi)
0xf3 0x66 0x6d
-# CHECK: rep
-# CHECK: insw %dx, %es:(%rdi)
+# CHECK: rep insw %dx, %es:(%rdi)
0x66 0xf3 0x6d
diff --git a/test/MC/Disassembler/X86/prefixes.txt b/test/MC/Disassembler/X86/prefixes.txt
index 983e09670d6..75e11ae93f4 100644
--- a/test/MC/Disassembler/X86/prefixes.txt
+++ b/test/MC/Disassembler/X86/prefixes.txt
@@ -1,73 +1,53 @@
# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s
-# CHECK: rep
-# CHECK-NEXT: insb %dx, %es:(%rdi)
+# CHECK: rep insb %dx, %es:(%rdi)
0xf3 0x6c #rep ins
-# CHECK: rep
-# CHECK-NEXT: insl %dx, %es:(%rdi)
+# CHECK: rep insl %dx, %es:(%rdi)
0xf3 0x6d #rep ins
-# CHECK: rep
-# CHECK-NEXT: movsb (%rsi), %es:(%rdi)
+# CHECK: rep movsb (%rsi), %es:(%rdi)
0xf3 0xa4 #rep movs
-# CHECK: rep
-# CHECK-NEXT: movsl (%rsi), %es:(%rdi)
+# CHECK: rep movsl (%rsi), %es:(%rdi)
0xf3 0xa5 #rep movs
-# CHECK: rep
-# CHECK-NEXT: outsb (%rsi), %dx
+# CHECK: rep outsb (%rsi), %dx
0xf3 0x6e #rep outs
-# CHECK: rep
-# CHECK-NEXT: outsl (%rsi), %dx
+# CHECK: rep outsl (%rsi), %dx
0xf3 0x6f #rep outs
-# CHECK: rep
-# CHECK-NEXT: lodsb (%rsi), %al
+# CHECK: rep lodsb (%rsi), %al
0xf3 0xac #rep lods
-# CHECK: rep
-# CHECK-NEXT: lodsl (%rsi), %eax
+# CHECK: rep lodsl (%rsi), %eax
0xf3 0xad #rep lods
-# CHECK: rep
-# CHECK-NEXT: stosb %al, %es:(%rdi)
+# CHECK: rep stosb %al, %es:(%rdi)
0xf3 0xaa #rep stos
-# CHECK: rep
-# CHECK-NEXT: stosl %eax, %es:(%rdi)
+# CHECK: rep stosl %eax, %es:(%rdi)
0xf3 0xab #rep stos
-# CHECK: rep
-# CHECK-NEXT: cmpsb %es:(%rdi), (%rsi)
+# CHECK: rep cmpsb %es:(%rdi), (%rsi)
0xf3 0xa6 #rep cmps
-# CHECK: rep
-# CHECK-NEXT: cmpsl %es:(%rdi), (%rsi)
+# CHECK: rep cmpsl %es:(%rdi), (%rsi)
0xf3 0xa7 #repe cmps
-# CHECK: rep
-# CHECK-NEXT: scasb %es:(%rdi), %al
+# CHECK: rep scasb %es:(%rdi), %al
0xf3 0xae #repe scas
-# CHECK: rep
-# CHECK-NEXT: scasl %es:(%rdi), %eax
+# CHECK: rep scasl %es:(%rdi), %eax
0xf3 0xaf #repe scas
-# CHECK: repne
-# CHECK-NEXT: cmpsb %es:(%rdi), (%rsi)
+# CHECK: repne cmpsb %es:(%rdi), (%rsi)
0xf2 0xa6 #repne cmps
-# CHECK: repne
-# CHECK-NEXT: cmpsl %es:(%rdi), (%rsi)
+# CHECK: repne cmpsl %es:(%rdi), (%rsi)
0xf2 0xa7 #repne cmps
-# CHECK: repne
-# CHECK-NEXT: scasb %es:(%rdi), %al
+# CHECK: repne scasb %es:(%rdi), %al
0xf2 0xae #repne scas
-# CHECK: repne
-# CHECK-NEXT: scasl %es:(%rdi), %eax
+# CHECK: repne scasl %es:(%rdi), %eax
0xf2 0xaf #repne scas
# CHECK: lock
-# CHECK-NEXT: orl $16, %fs:776
+# CHECK-NEXT: orl $16, %fs:776
0xf0 0x64 0x83 0x0c 0x25 0x08 0x03 0x00 0x00 0x10
# CHECK: movq %fs:768, %rdi
0x64 0x48 0x8b 0x3c 0x25 0x00 0x03 0x00 0x00
-# CHECK: rep
-# CHECK-NEXT: stosq %rax, %es:(%rdi)
+# CHECK: rep stosq %rax, %es:(%rdi)
0xf3 0x48 0xab
-# CHECK: rep
-# CHECK-NEXT: stosq %rax, %es:(%edi)
+# CHECK: rep stosq %rax, %es:(%edi)
0xf3 0x67 0x48 0xab
# CHECK: movl 32(%rbp), %eax
@@ -104,11 +84,9 @@
0x66,0x83,0xc0,0xf4
# Test that multiple redundant prefixes work (redundant, but valid x86).
-# CHECK: rep
-# CHECK-NEXT: stosq
+# CHECK: rep stosq
0xf3 0xf3 0x48 0xab
-
# Test that we can disassembler control registers above CR8
# CHECK: movq %cr15, %rax
0x44 0x0f 0x20 0xf8
diff --git a/test/MC/Disassembler/X86/simple-tests.txt b/test/MC/Disassembler/X86/simple-tests.txt
index 86d9f92fbbf..39074934164 100644
--- a/test/MC/Disassembler/X86/simple-tests.txt
+++ b/test/MC/Disassembler/X86/simple-tests.txt
@@ -851,14 +851,11 @@
0xf0 0x48 0x0f 0xc1 0xcb
# rdar://13493622 lldb doesn't print the x86 rep/repne prefix when disassembling
-# CHECK: repne
-# CHECK-NEXT: movsl
+# CHECK: repne movsl
0xf2 0xa5
-# CHECK: repne
-# CHECK-NEXT: movsq
+# CHECK: repne movsq
0xf2 0x48 0xa5
-# CHECK: repne
-# CHECK-NEXT: movb $0, (%rax)
+# CHECK: repne movb $0, (%rax)
0xf2 0xc6 0x0 0x0
# rdar://11019859 Support 2013 Haswell RTM instructions and HLE prefixes
diff --git a/test/MC/Mips/micromips32r6/valid.s b/test/MC/Mips/micromips32r6/valid.s
index 66fcf72ec7f..b47924453cb 100644
--- a/test/MC/Mips/micromips32r6/valid.s
+++ b/test/MC/Mips/micromips32r6/valid.s
@@ -84,7 +84,7 @@
lwm32 $16, $17, $18, $19, $20, $21, $22, $23, $fp, 8($4) # CHECK: lwm32 $16, $17, $18, $19, $20, $21, $22, $23, $fp, 8($4) # encoding: [0x21,0x24,0x50,0x08]
lwm32 $16, $17, $18, $19, $20, $21, $22, $23, $fp, $ra, 8($4) # CHECK: lwm32 $16, $17, $18, $19, $20, $21, $22, $23, $fp, $ra, 8($4) # encoding: [0x23,0x24,0x50,0x08]
lwm32 $16, $17, $18, $19, $20, $21, $22, $23, $fp, $ra, 8($4) # CHECK: lwm32 $16, $17, $18, $19, $20, $21, $22, $23, $fp, $ra, 8($4) # encoding: [0x23,0x24,0x50,0x08]
- movep $5, $6, $2, $3 # CHECK: movep $5, $6, $2, $3 # encoding: [0x84,0x34]
+ movep $5, $6, $2, $3 # CHECK: movep $5, $6, $2, $3 # encoding: [0x44,0x36]
rotr $2, 7 # CHECK: rotr $2, $2, 7 # encoding: [0x00,0x42,0x38,0xc0]
rotr $9, $6, 7 # CHECK: rotr $9, $6, 7 # encoding: [0x01,0x26,0x38,0xc0]
rotrv $9, $6, $7 # CHECK: rotrv $9, $6, $7 # encoding: [0x00,0xc7,0x48,0xd0]
diff --git a/test/MC/Mips/micromips64r6/valid.s b/test/MC/Mips/micromips64r6/valid.s
index 641e16c1457..a2acedb03c0 100644
--- a/test/MC/Mips/micromips64r6/valid.s
+++ b/test/MC/Mips/micromips64r6/valid.s
@@ -35,7 +35,7 @@ a:
lhu16 $3, 4($16) # CHECK: lhu16 $3, 4($16) # encoding: [0x29,0x82]
lbu16 $3, 4($17) # CHECK: lbu16 $3, 4($17) # encoding: [0x09,0x94]
lbu16 $3, -1($17) # CHECK: lbu16 $3, -1($17) # encoding: [0x09,0x9f]
- movep $5, $6, $2, $3 # CHECK: movep $5, $6, $2, $3 # encoding: [0x84,0x34]
+ movep $5, $6, $2, $3 # CHECK: movep $5, $6, $2, $3 # encoding: [0x44,0x36]
not16 $4, $7 # CHECK: not16 $4, $7 # encoding: [0x46,0x70]
or16 $3, $7 # CHECK: or16 $3, $7 # encoding: [0x45,0xf9]
ll $2, 8($4) # CHECK: ll $2, 8($4) # encoding: [0x60,0x44,0x30,0x08]
diff --git a/test/MC/Mips/tls-symbols.s b/test/MC/Mips/tls-symbols.s
new file mode 100644
index 00000000000..d5a31b18950
--- /dev/null
+++ b/test/MC/Mips/tls-symbols.s
@@ -0,0 +1,28 @@
+# RUN: llvm-mc -arch=mips < %s -position-independent -filetype=obj \
+# RUN: | llvm-readelf -symbols | FileCheck %s
+# RUN: llvm-mc -arch=mips < %s -filetype=obj | llvm-readelf -symbols \
+# RUN: | FileCheck %s
+
+# Test that TLS relocations cause symbols to be marked as TLS symbols.
+
+ .set noat
+ lui $3, %tlsgd(foo1)
+ lui $1, %dtprel_hi(foo2)
+ lui $1, %dtprel_lo(foo3)
+ lui $1, %tprel_hi(foo4)
+ lui $1, %tprel_lo(foo5)
+ lw $2, %gottprel(foo6)($28)
+
+ .hidden foo1
+ .hidden foo2
+ .hidden foo3
+ .hidden foo4
+ .hidden foo5
+ .hidden foo6
+
+# CHECK: 1: {{.+}} {{.+}} TLS GLOBAL HIDDEN UND foo1
+# CHECK: 2: {{.+}} {{.+}} TLS GLOBAL HIDDEN UND foo2
+# CHECK: 3: {{.+}} {{.+}} TLS GLOBAL HIDDEN UND foo3
+# CHECK: 4: {{.+}} {{.+}} TLS GLOBAL HIDDEN UND foo4
+# CHECK: 5: {{.+}} {{.+}} TLS GLOBAL HIDDEN UND foo5
+# CHECK: 6: {{.+}} {{.+}} TLS GLOBAL HIDDEN UND foo6
diff --git a/test/Object/Inputs/trivial-object-test.coff-arm64 b/test/Object/Inputs/trivial-object-test.coff-arm64
new file mode 100644
index 00000000000..0d23aa29524
--- /dev/null
+++ b/test/Object/Inputs/trivial-object-test.coff-arm64
Binary files differ
diff --git a/test/Object/Inputs/trivial-object-test.coff-armnt b/test/Object/Inputs/trivial-object-test.coff-armnt
new file mode 100644
index 00000000000..5bbf79f1b54
--- /dev/null
+++ b/test/Object/Inputs/trivial-object-test.coff-armnt
Binary files differ
diff --git a/test/Object/archive-SYM64-write.test b/test/Object/archive-SYM64-write.test
new file mode 100644
index 00000000000..161d6cb8191
--- /dev/null
+++ b/test/Object/archive-SYM64-write.test
@@ -0,0 +1,38 @@
+# REQUIRES: llvm-64-bits
+# REQUIRES: system-linux
+
+# RUN: yaml2obj %s > %t
+# RUN: dd if=%t of=%t bs=1 count=0 seek=2200M
+# RUN: rm -f %t.lib
+# RUN: cp %t %t2
+# RUN: llvm-ar cr %t.lib %t %t2 %p/Inputs/trivial-object-test.elf-x86-64
+# RUN: llvm-nm --print-armap %t.lib | FileCheck %s
+
+# Delete temp files. They are too large.
+# RUN: rm -f %t %t2 %t.lib
+
+!ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+Sections:
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ AddressAlign: 0x0000000000000001
+ Content: "00"
+ Size: 32
+
+# CHECK: Archive map
+# CHECK-NEXT: main in trivial-object-test.elf-x86-64
+
+# CHECK: archive-SYM64-write.test.tmp:
+
+# CHECK: archive-SYM64-write.test.tmp2:
+
+# CHECK: trivial-object-test.elf-x86-64:
+# CHECK-NEXT: U SomeOtherFunction
+# CHECK-NEXT: 0000000000000000 T main
+# CHECK-NEXT: U puts
diff --git a/test/Object/obj2yaml.test b/test/Object/obj2yaml.test
index 3d89f53bafc..7b274b31bb1 100644
--- a/test/Object/obj2yaml.test
+++ b/test/Object/obj2yaml.test
@@ -1,5 +1,7 @@
RUN: obj2yaml %p/Inputs/trivial-object-test.coff-i386 | FileCheck %s --check-prefix COFF-I386
RUN: obj2yaml %p/Inputs/trivial-object-test.coff-x86-64 | FileCheck %s --check-prefix COFF-X86-64
+RUN: obj2yaml %p/Inputs/trivial-object-test.coff-armnt | FileCheck %s --check-prefix COFF-ARMNT
+RUN: obj2yaml %p/Inputs/trivial-object-test.coff-arm64 | FileCheck %s --check-prefix COFF-ARM64
RUN: obj2yaml %p/Inputs/trivial-object-test.elf-mipsel | FileCheck %s --check-prefix ELF-MIPSEL
RUN: obj2yaml %p/Inputs/trivial-object-test.elf-mips64el | FileCheck %s --check-prefix ELF-MIPS64EL
RUN: obj2yaml %p/Inputs/trivial-object-test.elf-x86-64 | FileCheck %s --check-prefix ELF-X86-64
@@ -189,6 +191,162 @@ COFF-X86-64-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
COFF-X86-64-NEXT: ComplexType: IMAGE_SYM_DTYPE_FUNCTION
COFF-X86-64-NEXT: StorageClass: IMAGE_SYM_CLASS_STATIC
+COFF-ARMNT: header:
+COFF-ARMNT-NEXT: Machine: IMAGE_FILE_MACHINE_ARMNT
+
+COFF-ARMNT: sections:
+COFF-ARMNT-NEXT: - Name: .text
+COFF-ARMNT-NEXT: Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_PURGEABLE, IMAGE_SCN_MEM_16BIT, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+COFF-ARMNT-NEXT: Alignment: 4
+COFF-ARMNT-NEXT: SectionData: 00F000F87047
+
+COFF-ARMNT: Relocations:
+COFF-ARMNT-NEXT: - VirtualAddress: 0
+COFF-ARMNT-NEXT: SymbolName: otherFunc
+COFF-ARMNT-NEXT: Type: IMAGE_REL_ARM_BLX23T
+
+COFF-ARMNT: - Name: .data
+COFF-ARMNT-NEXT: Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+COFF-ARMNT-NEXT: Alignment: 4
+COFF-ARMNT-NEXT: SectionData: ''
+
+COFF-ARMNT: - Name: .bss
+COFF-ARMNT-NEXT: Characteristics: [ IMAGE_SCN_CNT_UNINITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+COFF-ARMNT-NEXT: Alignment: 4
+COFF-ARMNT-NEXT: SectionData: ''
+
+COFF-ARMNT: symbols:
+COFF-ARMNT-NEXT: - Name: .text
+COFF-ARMNT-NEXT: Value: 0
+COFF-ARMNT-NEXT: SectionNumber: 1
+COFF-ARMNT-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARMNT-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARMNT-NEXT: StorageClass: IMAGE_SYM_CLASS_STATIC
+COFF-ARMNT-NEXT: SectionDefinition:
+COFF-ARMNT-NEXT: Length: 6
+COFF-ARMNT-NEXT: NumberOfRelocations: 1
+COFF-ARMNT-NEXT: NumberOfLinenumbers: 0
+COFF-ARMNT-NEXT: CheckSum: 879026160
+COFF-ARMNT-NEXT: Number: 1
+
+COFF-ARMNT: - Name: .data
+COFF-ARMNT-NEXT: Value: 0
+COFF-ARMNT-NEXT: SectionNumber: 2
+COFF-ARMNT-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARMNT-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARMNT-NEXT: StorageClass: IMAGE_SYM_CLASS_STATIC
+COFF-ARMNT-NEXT: SectionDefinition:
+COFF-ARMNT-NEXT: Length: 0
+COFF-ARMNT-NEXT: NumberOfRelocations: 0
+COFF-ARMNT-NEXT: NumberOfLinenumbers: 0
+COFF-ARMNT-NEXT: CheckSum: 0
+COFF-ARMNT-NEXT: Number: 2
+
+COFF-ARMNT: - Name: .bss
+COFF-ARMNT-NEXT: Value: 0
+COFF-ARMNT-NEXT: SectionNumber: 3
+COFF-ARMNT-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARMNT-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARMNT-NEXT: StorageClass: IMAGE_SYM_CLASS_STATIC
+COFF-ARMNT-NEXT: SectionDefinition:
+COFF-ARMNT-NEXT: Length: 0
+COFF-ARMNT-NEXT: NumberOfRelocations: 0
+COFF-ARMNT-NEXT: NumberOfLinenumbers: 0
+COFF-ARMNT-NEXT: CheckSum: 0
+COFF-ARMNT-NEXT: Number: 3
+
+COFF-ARMNT: - Name: main
+COFF-ARMNT-NEXT: Value: 0
+COFF-ARMNT-NEXT: SectionNumber: 1
+COFF-ARMNT-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARMNT-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARMNT-NEXT: StorageClass: IMAGE_SYM_CLASS_EXTERNAL
+
+COFF-ARMNT: - Name: otherFunc
+COFF-ARMNT-NEXT: Value: 0
+COFF-ARMNT-NEXT: SectionNumber: 0
+COFF-ARMNT-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARMNT-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARMNT-NEXT: StorageClass: IMAGE_SYM_CLASS_EXTERNAL
+
+COFF-ARM64: header:
+COFF-ARM64-NEXT: Machine: IMAGE_FILE_MACHINE_ARM64
+
+COFF-ARM64: sections:
+COFF-ARM64-NEXT: - Name: .text
+COFF-ARM64-NEXT: Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+COFF-ARM64-NEXT: Alignment: 4
+COFF-ARM64-NEXT: SectionData: 00000094C0035FD6
+
+COFF-ARM64: Relocations:
+COFF-ARM64-NEXT: - VirtualAddress: 0
+COFF-ARM64-NEXT: SymbolName: otherFunc
+COFF-ARM64-NEXT: Type: IMAGE_REL_ARM64_BRANCH26
+
+COFF-ARM64: - Name: .data
+COFF-ARM64-NEXT: Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+COFF-ARM64-NEXT: Alignment: 4
+COFF-ARM64-NEXT: SectionData: ''
+
+COFF-ARM64: - Name: .bss
+COFF-ARM64-NEXT: Characteristics: [ IMAGE_SCN_CNT_UNINITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+COFF-ARM64-NEXT: Alignment: 4
+COFF-ARM64-NEXT: SectionData: ''
+
+COFF-ARM64: symbols:
+COFF-ARM64-NEXT: - Name: .text
+COFF-ARM64-NEXT: Value: 0
+COFF-ARM64-NEXT: SectionNumber: 1
+COFF-ARM64-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARM64-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARM64-NEXT: StorageClass: IMAGE_SYM_CLASS_STATIC
+COFF-ARM64-NEXT: SectionDefinition:
+COFF-ARM64-NEXT: Length: 8
+COFF-ARM64-NEXT: NumberOfRelocations: 1
+COFF-ARM64-NEXT: NumberOfLinenumbers: 0
+COFF-ARM64-NEXT: CheckSum: 35579893
+COFF-ARM64-NEXT: Number: 1
+
+COFF-ARM64: - Name: .data
+COFF-ARM64-NEXT: Value: 0
+COFF-ARM64-NEXT: SectionNumber: 2
+COFF-ARM64-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARM64-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARM64-NEXT: StorageClass: IMAGE_SYM_CLASS_STATIC
+COFF-ARM64-NEXT: SectionDefinition:
+COFF-ARM64-NEXT: Length: 0
+COFF-ARM64-NEXT: NumberOfRelocations: 0
+COFF-ARM64-NEXT: NumberOfLinenumbers: 0
+COFF-ARM64-NEXT: CheckSum: 0
+COFF-ARM64-NEXT: Number: 2
+
+COFF-ARM64: - Name: .bss
+COFF-ARM64-NEXT: Value: 0
+COFF-ARM64-NEXT: SectionNumber: 3
+COFF-ARM64-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARM64-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARM64-NEXT: StorageClass: IMAGE_SYM_CLASS_STATIC
+COFF-ARM64-NEXT: SectionDefinition:
+COFF-ARM64-NEXT: Length: 0
+COFF-ARM64-NEXT: NumberOfRelocations: 0
+COFF-ARM64-NEXT: NumberOfLinenumbers: 0
+COFF-ARM64-NEXT: CheckSum: 0
+COFF-ARM64-NEXT: Number: 3
+
+COFF-ARM64: - Name: main
+COFF-ARM64-NEXT: Value: 0
+COFF-ARM64-NEXT: SectionNumber: 1
+COFF-ARM64-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARM64-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARM64-NEXT: StorageClass: IMAGE_SYM_CLASS_EXTERNAL
+
+COFF-ARM64: - Name: otherFunc
+COFF-ARM64-NEXT: Value: 0
+COFF-ARM64-NEXT: SectionNumber: 0
+COFF-ARM64-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARM64-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARM64-NEXT: StorageClass: IMAGE_SYM_CLASS_EXTERNAL
+
ELF-MIPSEL: FileHeader:
ELF-MIPSEL-NEXT: Class: ELFCLASS32
ELF-MIPSEL-NEXT: Data: ELFDATA2LSB
diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll
index 816f75310e3..0810a13c141 100644
--- a/test/Other/new-pm-defaults.ll
+++ b/test/Other/new-pm-defaults.ll
@@ -76,6 +76,7 @@
; CHECK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass
+; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass
; CHECK-O-NEXT: Finished llvm::Function pass manager run.
; CHECK-O-NEXT: Running pass: IPSCCPPass
; CHECK-O-NEXT: Running pass: CalledValuePropagationPass
diff --git a/test/Other/new-pm-lto-defaults.ll b/test/Other/new-pm-lto-defaults.ll
index fc52f70ff4c..878198d1447 100644
--- a/test/Other/new-pm-lto-defaults.ll
+++ b/test/Other/new-pm-lto-defaults.ll
@@ -29,9 +29,14 @@
; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass
; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass
; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
+; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Module
+; CHECK-O2-NEXT: Starting llvm::Function pass manager run.
+; CHECK-O2-NEXT: Running pass: CallSiteSplittingPass on foo
+; CHECK-O2-NEXT: Running analysis: TargetLibraryAnalysis on foo
+; CHECK-O2-NEXT: Finished llvm::Function pass manager run.
; CHECK-O2-NEXT: PGOIndirectCallPromotion
; CHECK-O2-NEXT: Running analysis: ProfileSummaryAnalysis
-; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Function
; CHECK-O2-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
; CHECK-O2-NEXT: Running pass: IPSCCPPass
; CHECK-O2-NEXT: Running pass: CalledValuePropagationPass
@@ -42,7 +47,7 @@
; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy
; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}>
; CHECK-O-NEXT: Running analysis: AAManager
-; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-O1-NEXT: Running analysis: TargetLibraryAnalysis
; CHECK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass
; CHECK-O-NEXT: Running analysis: CallGraphAnalysis
; CHECK-O-NEXT: Running pass: GlobalSplitPass
diff --git a/test/Other/new-pm-thinlto-defaults.ll b/test/Other/new-pm-thinlto-defaults.ll
index 7d40ef3eea2..e83f0f87055 100644
--- a/test/Other/new-pm-thinlto-defaults.ll
+++ b/test/Other/new-pm-thinlto-defaults.ll
@@ -72,6 +72,7 @@
; CHECK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass
+; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass
; CHECK-O-NEXT: Finished llvm::Function pass manager run.
; CHECK-O-NEXT: Running pass: IPSCCPPass
; CHECK-O-NEXT: Running pass: CalledValuePropagationPass
diff --git a/test/ThinLTO/X86/deadstrip.ll b/test/ThinLTO/X86/deadstrip.ll
index c19ccb01be3..90de3bb9a32 100644
--- a/test/ThinLTO/X86/deadstrip.ll
+++ b/test/ThinLTO/X86/deadstrip.ll
@@ -18,8 +18,8 @@
; RUN: -r %t2.bc,_boo,pl \
; RUN: -r %t2.bc,_dead_func,pl \
; RUN: -r %t2.bc,_another_dead_func,pl
-; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s
-; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=CHECK2
+; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s --check-prefix=LTO2
+; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=LTO2-CHECK2
; RUN: llvm-nm %t.out.1 | FileCheck %s --check-prefix=CHECK2-NM
; RUN: llvm-bcanalyzer -dump %t.out.index.bc | FileCheck %s --check-prefix=COMBINED
@@ -27,14 +27,14 @@
; COMBINED-DAG: <COMBINED {{.*}} op2=55
; Live, Internal
; COMBINED-DAG: <COMBINED {{.*}} op2=39
-; Live, External
-; COMBINED-DAG: <COMBINED {{.*}} op2=32
-; COMBINED-DAG: <COMBINED {{.*}} op2=32
-; COMBINED-DAG: <COMBINED {{.*}} op2=32
-; (Dead)
-; COMBINED-DAG: <COMBINED {{.*}} op2=0
-; COMBINED-DAG: <COMBINED {{.*}} op2=0
-; COMBINED-DAG: <COMBINED {{.*}} op2=0
+; Live, Local, External
+; COMBINED-DAG: <COMBINED {{.*}} op2=96
+; COMBINED-DAG: <COMBINED {{.*}} op2=96
+; COMBINED-DAG: <COMBINED {{.*}} op2=96
+; Local, (Dead)
+; COMBINED-DAG: <COMBINED {{.*}} op2=64
+; COMBINED-DAG: <COMBINED {{.*}} op2=64
+; COMBINED-DAG: <COMBINED {{.*}} op2=64
; Dead-stripping on the index allows to internalize these,
; and limit the import of @baz thanks to early pruning.
@@ -45,10 +45,18 @@
; CHECK: define internal void @bar_internal()
; CHECK: define internal void @dead_func() {
; CHECK-NOT: available_externally {{.*}} @baz()
+; LTO2-NOT: available_externally {{.*}} @baz()
+; LTO2: @llvm.global_ctors =
+; LTO2: define internal void @_GLOBAL__I_a()
+; LTO2: define internal dso_local void @bar() {
+; LTO2: define internal void @bar_internal()
+; LTO2: define internal dso_local void @dead_func() {
+; LTO2-NOT: available_externally {{.*}} @baz()
; Make sure we didn't internalize @boo, which is reachable via
; llvm.global_ctors
; CHECK2: define void @boo()
+; LTO2-CHECK2: define dso_local void @boo()
; We should have eventually removed @baz since it was internalized and unused
; CHECK2-NM-NOT: _baz
@@ -80,7 +88,7 @@
; We can't internalize @dead_func because of the use in the regular LTO
; partition.
-; CHECK-NOTDEAD: define void @dead_func()
+; CHECK-NOTDEAD: define dso_local void @dead_func()
; We also can't eliminate @baz because it is in the regular LTO partition
; and called from @dead_func.
; CHECK-NM-NOTDEAD: T _baz
diff --git a/test/ThinLTO/X86/funcimport2.ll b/test/ThinLTO/X86/funcimport2.ll
index 7338f9a9d98..86ce715f4e0 100644
--- a/test/ThinLTO/X86/funcimport2.ll
+++ b/test/ThinLTO/X86/funcimport2.ll
@@ -7,7 +7,7 @@
; RUN: -r=%t2.bc,_main,plx \
; RUN: -r=%t2.bc,_foo,l
; RUN: llvm-dis %t.o.1.3.import.bc -o - | FileCheck %s
-; CHECK: define available_externally void @foo()
+; CHECK: define available_externally dso_local void @foo()
; We shouldn't do any importing at -O0
; rm -f %t.o.1.3.import.bc
@@ -17,7 +17,7 @@
; RUN: -r=%t2.bc,_main,plx \
; RUN: -r=%t2.bc,_foo,l
; RUN: llvm-dis %t.o.1.3.import.bc -o - | FileCheck %s --check-prefix=CHECKO0
-; CHECKO0: declare void @foo(...)
+; CHECKO0: declare dso_local void @foo(...)
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
diff --git a/test/ThinLTO/X86/internalize.ll b/test/ThinLTO/X86/internalize.ll
index 867e3e5a00a..f40fbcd4b41 100644
--- a/test/ThinLTO/X86/internalize.ll
+++ b/test/ThinLTO/X86/internalize.ll
@@ -1,4 +1,4 @@
-;; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %s -o %t1.bc
; RUN: llvm-lto -thinlto-action=thinlink -o %t.index.bc %t1.bc
; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc -o - | llvm-dis -o - | FileCheck %s --check-prefix=REGULAR
; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc -o - --exported-symbol=foo | llvm-dis -o - | FileCheck %s --check-prefix=INTERNALIZE
@@ -7,7 +7,7 @@
; RUN: -r=%t1.bc,_foo,pxl \
; RUN: -r=%t1.bc,_bar,pl \
; RUN: -r=%t1.bc,_linkonce_func,pl
-; RUN: llvm-dis < %t.o.0.2.internalize.bc | FileCheck %s --check-prefix=INTERNALIZE
+; RUN: llvm-dis < %t.o.0.2.internalize.bc | FileCheck %s --check-prefix=INTERNALIZE2
; REGULAR: define void @foo
@@ -16,6 +16,9 @@
; INTERNALIZE: define void @foo
; INTERNALIZE: define internal void @bar
; INTERNALIZE: define internal void @linkonce_func()
+; INTERNALIZE2: define dso_local void @foo
+; INTERNALIZE2: define internal dso_local void @bar
+; INTERNALIZE2: define internal dso_local void @linkonce_func()
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
@@ -29,4 +32,4 @@ define void @bar() {
}
define linkonce void @linkonce_func() {
ret void
-} \ No newline at end of file
+}
diff --git a/test/ThinLTO/X86/lazyload_metadata.ll b/test/ThinLTO/X86/lazyload_metadata.ll
index a6d46e5586a..4680e462458 100644
--- a/test/ThinLTO/X86/lazyload_metadata.ll
+++ b/test/ThinLTO/X86/lazyload_metadata.ll
@@ -10,13 +10,13 @@
; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \
; RUN: -o /dev/null -stats \
; RUN: 2>&1 | FileCheck %s -check-prefix=LAZY
-; LAZY: 53 bitcode-reader - Number of Metadata records loaded
+; LAZY: 55 bitcode-reader - Number of Metadata records loaded
; LAZY: 2 bitcode-reader - Number of MDStrings loaded
; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \
; RUN: -o /dev/null -disable-ondemand-mds-loading -stats \
; RUN: 2>&1 | FileCheck %s -check-prefix=NOTLAZY
-; NOTLAZY: 62 bitcode-reader - Number of Metadata records loaded
+; NOTLAZY: 64 bitcode-reader - Number of Metadata records loaded
; NOTLAZY: 7 bitcode-reader - Number of MDStrings loaded
diff --git a/test/ThinLTO/X86/reference_non_importable.ll b/test/ThinLTO/X86/reference_non_importable.ll
index 5cf225e95de..99b79ce198e 100644
--- a/test/ThinLTO/X86/reference_non_importable.ll
+++ b/test/ThinLTO/X86/reference_non_importable.ll
@@ -22,7 +22,7 @@ target triple = "x86_64-apple-macosx10.11.0"
; We want foo to be imported in the main module!
; RUN: llvm-dis < %t.o.1.3.import.bc | FileCheck %s --check-prefix=IMPORT
-; IMPORT: define available_externally i8** @foo()
+; IMPORT: define available_externally dso_local i8** @foo()
define i8 **@foo() {
ret i8 **@b
}
diff --git a/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
new file mode 100644
index 00000000000..d1d854d8f45
--- /dev/null
+++ b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
@@ -0,0 +1,339 @@
+; RUN: opt < %s -callsite-splitting -S | FileCheck %s
+; RUN: opt < %s -passes='function(callsite-splitting)' -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linaro-linux-gnueabi"
+
+;CHECK-LABEL: @test_eq_eq
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_eq_eq(i32* %a, i32 %v) {
+Header:
+ %tobool1 = icmp eq i32* %a, null
+ br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+ %cmp = icmp eq i32 %v, 1
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_ne_eq
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_ne_eq(i32* %a, i32 %v) {
+Header:
+ %tobool1 = icmp ne i32* %a, null
+ br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+ %cmp = icmp eq i32 %v, 1
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_ne_ne
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_ne_ne(i32* %a, i32 %v) {
+Header:
+ %tobool1 = icmp ne i32* %a, null
+ br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+ %cmp = icmp ne i32 %v, 1
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_eq_eq_untaken
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_eq_eq_untaken(i32* %a, i32 %v) {
+Header:
+ %tobool1 = icmp eq i32* %a, null
+ br i1 %tobool1, label %TBB, label %Tail
+
+TBB:
+ %cmp = icmp eq i32 %v, 1
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_ne_eq_untaken
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_ne_eq_untaken(i32* %a, i32 %v) {
+Header:
+ %tobool1 = icmp ne i32* %a, null
+ br i1 %tobool1, label %TBB, label %Tail
+
+TBB:
+ %cmp = icmp eq i32 %v, 1
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_ne_ne_untaken
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_ne_ne_untaken(i32* %a, i32 %v) {
+Header:
+ %tobool1 = icmp ne i32* %a, null
+ br i1 %tobool1, label %TBB, label %Tail
+
+TBB:
+ %cmp = icmp ne i32 %v, 1
+ br i1 %cmp, label %End, label %Tail
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_nonconst_const_phi
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_nonconst_const_phi(i32* %a, i32* %b, i32 %v) {
+Header:
+ %tobool1 = icmp eq i32* %a, %b
+ br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+ %cmp = icmp eq i32 %v, 1
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_nonconst_nonconst_phi
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 %v, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_nonconst_nonconst_phi(i32* %a, i32* %b, i32 %v, i32 %v2) {
+Header:
+ %tobool1 = icmp eq i32* %a, %b
+ br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+ %cmp = icmp eq i32 %v, %v2
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_nonconst_nonconst_phi_noncost
+;CHECK-NOT: Tail.predBB1.split:
+;CHECK-NOT: Tail.predBB2.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+;CHECK: ret i32 %r
+define i32 @test_nonconst_nonconst_phi_noncost(i32* %a, i32* %b, i32 %v, i32 %v2) {
+Header:
+ %tobool1 = icmp eq i32* %a, %b
+ br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+ %cmp = icmp eq i32 %v, %v2
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[%v,%Header], [%v2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_fisrtnonphi
+;CHECK-NOT: Tail.predBB1.split:
+;CHECK-NOT: Tail.predBB2.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+;CHECK: ret i32 %r
+define i32 @test_fisrtnonphi(i32* %a, i32 %v) {
+Header:
+ %tobool1 = icmp eq i32* %a, null
+ br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+ %cmp = icmp eq i32 %v, 1
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ store i32 %v, i32* %a
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_3preds_constphi
+;CHECK-NOT: Tail.predBB1.split:
+;CHECK-NOT: Tail.predBB2.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+;CHECK: ret i32 %r
+define i32 @test_3preds_constphi(i32* %a, i32 %v, i1 %c1, i1 %c2, i1 %c3) {
+Header:
+ br i1 %c1, label %Tail, label %TBB1
+
+TBB1:
+ br i1 %c2, label %Tail, label %TBB2
+
+TBB2:
+ br i1 %c3, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB1], [3, %TBB2]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_indirectbr_phi
+;CHECK-NOT: Tail.predBB1.split:
+;CHECK-NOT: Tail.predBB2.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+;CHECK: ret i32 %r
+define i32 @test_indirectbr_phi(i8* %address, i32* %a, i32* %b, i32 %v) {
+Header:
+ %indirect.goto.dest = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address
+ indirectbr i8* %indirect.goto.dest, [label %TBB, label %Tail]
+
+TBB:
+ %indirect.goto.dest2 = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address
+ indirectbr i8* %indirect.goto.dest2, [label %Tail, label %End]
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+define i32 @callee(i32* %a, i32 %v, i32 %p) {
+entry:
+ %c = icmp ne i32* %a, null
+ br i1 %c, label %BB1, label %BB2
+
+BB1:
+ call void @dummy(i32* %a, i32 %p)
+ br label %End
+
+BB2:
+ call void @dummy2(i32 %v, i32 %p)
+ br label %End
+
+End:
+ ret i32 %p
+}
+
+declare void @dummy(i32*, i32)
+declare void @dummy2(i32, i32)
diff --git a/test/Transforms/CallSiteSplitting/callsite-split.ll b/test/Transforms/CallSiteSplitting/callsite-split.ll
new file mode 100644
index 00000000000..419fa738563
--- /dev/null
+++ b/test/Transforms/CallSiteSplitting/callsite-split.ll
@@ -0,0 +1,119 @@
+; RUN: opt < %s -callsite-splitting -inline -instcombine -jump-threading -S | FileCheck %s
+; RUN: opt < %s -passes='function(callsite-splitting),cgscc(inline),function(instcombine,jump-threading)' -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linaro-linux-gnueabi"
+
+%struct.bitmap = type { i32, %struct.bitmap* }
+
+;CHECK-LABEL: @caller
+;CHECK-LABEL: NextCond:
+;CHECK: br {{.*}} label %callee.exit
+;CHECK-LABEL: CallSiteBB.predBB1.split:
+;CHECK: call void @callee(%struct.bitmap* null, %struct.bitmap* null, %struct.bitmap* %b_elt, i1 false)
+;CHECK-LABEL: callee.exit:
+;CHECK: call void @dummy2(%struct.bitmap* %a_elt)
+
+define void @caller(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt) {
+entry:
+ br label %Top
+
+Top:
+ %tobool1 = icmp eq %struct.bitmap* %a_elt, null
+ br i1 %tobool1, label %CallSiteBB, label %NextCond
+
+NextCond:
+ %cmp = icmp ne %struct.bitmap* %b_elt, null
+ br i1 %cmp, label %CallSiteBB, label %End
+
+CallSiteBB:
+ %p = phi i1 [0, %Top], [%c, %NextCond]
+ call void @callee(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %p)
+ br label %End
+
+End:
+ ret void
+}
+
+define void @callee(%struct.bitmap* %dst_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %c) {
+entry:
+ %tobool = icmp ne %struct.bitmap* %a_elt, null
+ %tobool1 = icmp ne %struct.bitmap* %b_elt, null
+ %or.cond = and i1 %tobool, %tobool1
+ br i1 %or.cond, label %Cond, label %Big
+
+Cond:
+ %cmp = icmp eq %struct.bitmap* %dst_elt, %a_elt
+ br i1 %cmp, label %Small, label %Big
+
+Small:
+ call void @dummy2(%struct.bitmap* %a_elt)
+ br label %End
+
+Big:
+ call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+ call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+ call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+ call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+ call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+ call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+ call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+ br label %End
+
+End:
+ ret void
+}
+
+declare void @dummy2(%struct.bitmap*)
+declare void @dummy1(%struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*)
+
+
+;CHECK-LABEL: @caller2
+;CHECK-LABEL: CallSiteBB.predBB1.split:
+;CHECK: call void @dummy4()
+;CHECK-LABEL: CallSiteBB.predBB2.split:
+;CHECK: call void @dummy3()
+;CheCK-LABEL: CallSiteBB:
+;CHECK: %phi.call = phi i1 [ false, %CallSiteBB.predBB1.split ], [ true, %CallSiteBB.predBB2.split ]
+;CHECK: call void @foo(i1 %phi.call)
+define void @caller2(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, %struct.bitmap* %c_elt) {
+entry:
+ br label %Top
+
+Top:
+ %tobool1 = icmp eq %struct.bitmap* %a_elt, %b_elt
+ br i1 %tobool1, label %CallSiteBB, label %NextCond
+
+NextCond:
+ %cmp = icmp ne %struct.bitmap* %b_elt, %c_elt
+ br i1 %cmp, label %CallSiteBB, label %End
+
+CallSiteBB:
+ %phi = phi i1 [0, %Top],[1, %NextCond]
+ %u = call i1 @callee2(i1 %phi)
+ call void @foo(i1 %u)
+ br label %End
+
+End:
+ ret void
+}
+
+define i1 @callee2(i1 %b) {
+entry:
+ br i1 %b, label %BB1, label %BB2
+
+BB1:
+ call void @dummy3()
+ br label %End
+
+BB2:
+ call void @dummy4()
+ br label %End
+
+End:
+ ret i1 %b
+}
+
+declare void @dummy3()
+declare void @dummy4()
+declare void @foo(i1)
diff --git a/test/Transforms/CodeExtractor/PartialInlineNoInline.ll b/test/Transforms/CodeExtractor/PartialInlineNoInline.ll
new file mode 100644
index 00000000000..6c0b83298d2
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineNoInline.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -partial-inliner -S -stats -pass-remarks=partial-inlining 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -S -stats -pass-remarks=partial-inlining 2>&1 | FileCheck %s
+
+@stat = external global i32, align 4
+
+define i32 @inline_fail(i32 %count, ...) {
+entry:
+ %vargs = alloca i8*, align 8
+ %vargs1 = bitcast i8** %vargs to i8*
+ call void @llvm.va_start(i8* %vargs1)
+ %stat1 = load i32, i32* @stat, align 4
+ %cmp = icmp slt i32 %stat1, 0
+ br i1 %cmp, label %bb2, label %bb1
+
+bb1: ; preds = %entry
+ %vg1 = add nsw i32 %stat1, 1
+ store i32 %vg1, i32* @stat, align 4
+ %va1 = va_arg i8** %vargs, i32
+ call void @foo(i32 %count, i32 %va1) #2
+ br label %bb2
+
+bb2: ; preds = %bb1, %entry
+ %res = phi i32 [ 1, %bb1 ], [ 0, %entry ]
+ call void @llvm.va_end(i8* %vargs1)
+ ret i32 %res
+}
+
+define i32 @caller(i32 %arg) {
+bb:
+ %res = tail call i32 (i32, ...) @inline_fail(i32 %arg, i32 %arg)
+ ret i32 %res
+}
+
+declare void @foo(i32, i32)
+declare void @llvm.va_start(i8*)
+declare void @llvm.va_end(i8*)
+
+; Check that no remarks have been emitted, inline_fail has not been partial
+; inlined, no code has been extracted and the partial-inlining counter
+; has not been incremented.
+
+; CHECK-NOT: remark
+; CHECK: tail call i32 (i32, ...) @inline_fail(i32 %arg, i32 %arg)
+; CHECK-NOT: inline_fail.1_bb1
+; CHECK-NOT: partial-inlining
diff --git a/test/Transforms/CodeGenPrepare/ARM/sink-addrmode.ll b/test/Transforms/CodeGenPrepare/ARM/sink-addrmode.ll
new file mode 100644
index 00000000000..06a513543c4
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/ARM/sink-addrmode.ll
@@ -0,0 +1,18 @@
+; RUN: opt -S -codegenprepare -mtriple=thumbv7m -disable-complex-addr-modes=false -addr-sink-new-select=true < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; Select between two geps with different base, same constant offset
+define void @test_select_twogep_base(i32* %ptr1, i32* %ptr2, i32 %value) {
+; CHECK-LABEL: @test_select_twogep_base
+; CHECK-NOT: select i1 %cmp, i32* %gep1, i32* %gep2
+; CHECK: select i1 %cmp, i32* %ptr1, i32* %ptr2
+entry:
+ %cmp = icmp sgt i32 %value, 0
+ %gep1 = getelementptr inbounds i32, i32* %ptr1, i32 1
+ %gep2 = getelementptr inbounds i32, i32* %ptr2, i32 1
+ %select = select i1 %cmp, i32* %gep1, i32* %gep2
+ store i32 %value, i32* %select, align 4
+ ret void
+}
+
diff --git a/test/Transforms/CodeGenPrepare/X86/sink-addrmode-base.ll b/test/Transforms/CodeGenPrepare/X86/sink-addrmode-base.ll
new file mode 100644
index 00000000000..2bacbdd7f40
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/X86/sink-addrmode-base.ll
@@ -0,0 +1,475 @@
+; RUN: opt -S -codegenprepare -disable-complex-addr-modes=false -addr-sink-new-phis=true -addr-sink-new-select=true %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-YES
+; RUN: opt -S -codegenprepare -disable-complex-addr-modes=false -addr-sink-new-phis=false -addr-sink-new-select=true %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NO
+target datalayout =
+"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Can we sink for different base if there is no phi for base?
+define i32 @test1(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test1
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO-NEXT: load
+ %c = phi i32* [%c1, %entry], [%c2, %if.then]
+ %v = load i32, i32* %c, align 4
+ ret i32 %v
+}
+
+; Can we sink for different base if there is phi for base?
+define i32 @test2(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test2
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK: getelementptr i8, {{.+}} 40
+ %b = phi i64* [%b1, %entry], [%b2, %if.then]
+ %c = phi i32* [%c1, %entry], [%c2, %if.then]
+ %v = load i32, i32* %c, align 4
+ ret i32 %v
+}
+
+; Can we sink for different base if there is phi for base but not valid one?
+define i32 @test3(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test3
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO: phi
+; CHECK-NO-NEXT: load
+ %b = phi i64* [%b2, %entry], [%b1, %if.then]
+ %c = phi i32* [%c1, %entry], [%c2, %if.then]
+ %v = load i32, i32* %c, align 4
+ ret i32 %v
+}
+
+; Can we sink for different base if both addresses are in the same block?
+define i32 @test4(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test4
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO-NEXT: load
+ %c = phi i32* [%c1, %entry], [%c2, %if.then]
+ %v = load i32, i32* %c, align 4
+ ret i32 %v
+}
+
+; Can we sink for different base if there is phi for base?
+; Both addresses are in the same block.
+define i32 @test5(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test5
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ br label %fallthrough
+
+fallthrough:
+; CHECK: getelementptr i8, {{.+}} 40
+ %b = phi i64* [%b1, %entry], [%b2, %if.then]
+ %c = phi i32* [%c1, %entry], [%c2, %if.then]
+ %v = load i32, i32* %c, align 4
+ ret i32 %v
+}
+
+; Can we sink for different base if there is phi for base but not valid one?
+; Both addresses are in the same block.
+define i32 @test6(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test6
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: load
+ %b = phi i64* [%b2, %entry], [%b1, %if.then]
+ %c = phi i32* [%c1, %entry], [%c2, %if.then]
+ %v = load i32, i32* %c, align 4
+ ret i32 %v
+}
+
+; case with a loop. No phi node.
+define i32 @test7(i32 %N, i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test7
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK-YES: sunk_phi
+ %iv = phi i32 [0, %entry], [%iv.inc, %fallthrough]
+ %c3 = phi i32* [%c1, %entry], [%c, %fallthrough]
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO-NEXT: load
+ %c = phi i32* [%c3, %loop], [%c2, %if.then]
+ %v = load volatile i32, i32* %c, align 4
+ %iv.inc = add i32 %iv, 1
+ %cmp = icmp slt i32 %iv.inc, %N
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i32 %v
+}
+
+; case with a loop. There is phi node.
+define i32 @test8(i32 %N, i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test8
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br label %loop
+
+loop:
+ %iv = phi i32 [0, %entry], [%iv.inc, %fallthrough]
+ %c3 = phi i32* [%c1, %entry], [%c, %fallthrough]
+ %b3 = phi i64* [%b1, %entry], [%b, %fallthrough]
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK: getelementptr i8, {{.+}} 40
+ %c = phi i32* [%c3, %loop], [%c2, %if.then]
+ %b = phi i64* [%b3, %loop], [%b2, %if.then]
+ %v = load volatile i32, i32* %c, align 4
+ %iv.inc = add i32 %iv, 1
+ %cmp = icmp slt i32 %iv.inc, %N
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i32 %v
+}
+
+; case with a loop. There is phi node but it does not fit.
+define i32 @test9(i32 %N, i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test9
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK-YES: sunk_phi
+ %iv = phi i32 [0, %entry], [%iv.inc, %fallthrough]
+ %c3 = phi i32* [%c1, %entry], [%c, %fallthrough]
+ %b3 = phi i64* [%b1, %entry], [%b2, %fallthrough]
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: load
+ %c = phi i32* [%c3, %loop], [%c2, %if.then]
+ %b = phi i64* [%b3, %loop], [%b2, %if.then]
+ %v = load volatile i32, i32* %c, align 4
+ %iv.inc = add i32 %iv, 1
+ %cmp = icmp slt i32 %iv.inc, %N
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i32 %v
+}
+
+; Case through a loop. No phi node.
+define i32 @test10(i32 %N, i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test10
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: br
+ %c = phi i32* [%c1, %entry], [%c2, %if.then]
+ br label %loop
+
+loop:
+ %iv = phi i32 [0, %fallthrough], [%iv.inc, %loop]
+ %iv.inc = add i32 %iv, 1
+ %cmp = icmp slt i32 %iv.inc, %N
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+; CHECK-YES: sunkaddr
+ %v = load volatile i32, i32* %c, align 4
+ ret i32 %v
+}
+
+; Case through a loop. There is a phi.
+define i32 @test11(i32 %N, i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test11
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK: phi
+; CHECK: phi
+; CHECK: br
+ %c = phi i32* [%c1, %entry], [%c2, %if.then]
+ %b = phi i64* [%b1, %entry], [%b2, %if.then]
+ br label %loop
+
+loop:
+ %iv = phi i32 [0, %fallthrough], [%iv.inc, %loop]
+ %iv.inc = add i32 %iv, 1
+ %cmp = icmp slt i32 %iv.inc, %N
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+; CHECK: sunkaddr
+ %v = load volatile i32, i32* %c, align 4
+ ret i32 %v
+}
+
+; Complex case with address value from previous iteration.
+define i32 @test12(i32 %N, i1 %cond, i64* %b1, i64* %b2, i64* %b3) {
+; CHECK-LABEL: @test12
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK-YES: sunk_phi
+; CHECK-NO: phi
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: br
+ %iv = phi i32 [0, %entry], [%iv.inc, %backedge]
+ %c3 = phi i32* [%c1, %entry], [%c, %backedge]
+ %b4 = phi i64* [%b1, %entry], [%b5, %backedge]
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK-LABEL: fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO: phi
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: load
+ %c = phi i32* [%c3, %loop], [%c2, %if.then]
+ %b6 = phi i64* [%b4, %loop], [%b2, %if.then]
+ %v = load volatile i32, i32* %c, align 4
+ %a4 = getelementptr inbounds i64, i64* %b4, i64 5
+ %c4 = bitcast i64* %a4 to i32*
+ %cmp = icmp slt i32 %iv, 20
+ br i1 %cmp, label %backedge, label %if.then.2
+
+if.then.2:
+ br label %backedge
+
+backedge:
+ %b5 = phi i64* [%b4, %fallthrough], [%b6, %if.then.2]
+ %iv.inc = add i32 %iv, 1
+ %cmp2 = icmp slt i32 %iv.inc, %N
+ br i1 %cmp2, label %loop, label %exit
+
+exit:
+ ret i32 %v
+}
+
+%struct.S = type {i32, i32}
+; Case with index
+define i32 @test13(i1 %cond, %struct.S* %b1, %struct.S* %b2, i64 %Index) {
+; CHECK-LABEL: @test13
+entry:
+ %a1 = getelementptr inbounds %struct.S, %struct.S* %b1, i64 %Index, i32 1
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %i2 = mul i64 %Index, 2
+ %a2 = getelementptr inbounds %struct.S, %struct.S* %b2, i64 %Index, i32 1
+ br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: load
+ %a = phi i32* [%a1, %entry], [%a2, %if.then]
+ %v = load i32, i32* %a, align 4
+ ret i32 %v
+}
+
+; Select of Select case.
+define i64 @test14(i1 %c1, i1 %c2, i64* %b1, i64* %b2, i64* %b3) {
+; CHECK-LABEL: @test14
+entry:
+; CHECK-LABEL: entry:
+ %g1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %g2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %g3 = getelementptr inbounds i64, i64* %b3, i64 5
+ %s1 = select i1 %c1, i64* %g1, i64* %g2
+ %s2 = select i1 %c2, i64* %s1, i64* %g3
+; CHECK: sunkaddr
+ %v = load i64 , i64* %s2, align 8
+ ret i64 %v
+}
+
+; Select of Phi case.
+define i64 @test15(i1 %c1, i1 %c2, i64* %b1, i64* %b2, i64* %b3) {
+; CHECK-LABEL: @test15
+entry:
+ %g1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %g2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %g3 = getelementptr inbounds i64, i64* %b3, i64 5
+ br i1 %c1, label %if.then, label %fallthrough
+
+if.then:
+ br label %fallthrough
+
+fallthrough:
+; CHECK-LABEL: fallthrough:
+ %p1 = phi i64* [%g1, %entry], [%g2, %if.then]
+ %s1 = select i1 %c2, i64* %p1, i64* %g3
+; CHECK-YES: sunkaddr
+; CHECK-NO: phi
+; CHECK-NO-NEXT: select
+; CHECK-NO-NEXT: load
+ %v = load i64 , i64* %s1, align 8
+ ret i64 %v
+}
+
+; Select of Phi case. Phi exists
+define i64 @test16(i1 %c1, i1 %c2, i64* %b1, i64* %b2, i64* %b3) {
+; CHECK-LABEL: @test16
+entry:
+ %g1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %g2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %g3 = getelementptr inbounds i64, i64* %b3, i64 5
+ br i1 %c1, label %if.then, label %fallthrough
+
+if.then:
+ br label %fallthrough
+
+fallthrough:
+; CHECK-LABEL: fallthrough:
+ %p = phi i64* [%b1, %entry], [%b2, %if.then]
+ %p1 = phi i64* [%g1, %entry], [%g2, %if.then]
+ %s1 = select i1 %c2, i64* %p1, i64* %g3
+; CHECK: sunkaddr
+ %v = load i64 , i64* %s1, align 8
+ ret i64 %v
+}
+
+; Phi of Select case.
+define i64 @test17(i1 %c1, i1 %c2, i64* %b1, i64* %b2, i64* %b3) {
+; CHECK-LABEL: @test17
+entry:
+ %g1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %g2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %g3 = getelementptr inbounds i64, i64* %b3, i64 5
+ %s1 = select i1 %c2, i64* %g1, i64* %g2
+ br i1 %c1, label %if.then, label %fallthrough
+
+if.then:
+ br label %fallthrough
+
+fallthrough:
+; CHECK-LABEL: fallthrough:
+ %p1 = phi i64* [%s1, %entry], [%g3, %if.then]
+; CHECK-YES: sunkaddr
+; CHECK-NO: phi
+; CHECK-NO-NEXT: load
+ %v = load i64 , i64* %p1, align 8
+ ret i64 %v
+}
diff --git a/test/LibDriver/lit.local.cfg b/test/Transforms/ExpandMemCmp/X86/lit.local.cfg
index e71f3cc4c41..e71f3cc4c41 100644
--- a/test/LibDriver/lit.local.cfg
+++ b/test/Transforms/ExpandMemCmp/X86/lit.local.cfg
diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/ExpandMemCmp/X86/memcmp.ll
index a4f635c956d..1abfb20f369 100644
--- a/test/Transforms/CodeGenPrepare/X86/memcmp.ll
+++ b/test/Transforms/ExpandMemCmp/X86/memcmp.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -codegenprepare -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32
-; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: opt -S -expandmemcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: opt -S -expandmemcmp -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64
declare i32 @memcmp(i8* nocapture, i8* nocapture, i64)
@@ -23,30 +23,33 @@ define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp3(
-; ALL-NEXT: loadbb:
-; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16*
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16*
-; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]]
-; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
-; ALL-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
-; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; ALL-NEXT: [[TMP6:%.*]] = icmp eq i16 [[TMP4]], [[TMP5]]
-; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; ALL-NEXT: br label [[LOADBB:%.*]]
; ALL: res_block:
-; ALL-NEXT: [[TMP7:%.*]] = icmp ult i16 [[TMP4]], [[TMP5]]
-; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i16 [ [[TMP7:%.*]], [[LOADBB]] ]
+; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i16 [ [[TMP8:%.*]], [[LOADBB]] ]
+; ALL-NEXT: [[TMP1:%.*]] = icmp ult i16 [[PHI_SRC1]], [[PHI_SRC2]]
+; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb:
+; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i16*
+; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i16*
+; ALL-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]]
+; ALL-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP4]]
+; ALL-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
+; ALL-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
+; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
; ALL: loadbb1:
-; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 2
-; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 2
-; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]]
+; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 2
+; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 2
; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]]
-; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]]
; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
+; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; ALL-NEXT: br label [[ENDBLOCK]]
; ALL: endblock:
-; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; ALL-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3)
@@ -74,30 +77,33 @@ define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp5(
-; ALL-NEXT: loadbb:
-; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
-; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
-; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
-; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; ALL-NEXT: br label [[LOADBB:%.*]]
; ALL: res_block:
-; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]]
-; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ]
+; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ]
+; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb:
+; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]]
+; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]]
+; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
+; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
+; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
; ALL: loadbb1:
-; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 4
-; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 4
-; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]]
+; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 4
+; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 4
; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]]
-; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]]
; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
+; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; ALL-NEXT: br label [[ENDBLOCK]]
; ALL: endblock:
-; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; ALL-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5)
@@ -106,36 +112,37 @@ define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp6(
-; ALL-NEXT: loadbb:
-; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
-; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
-; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
-; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; ALL-NEXT: br label [[LOADBB:%.*]]
; ALL: res_block:
-; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
-; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ]
-; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
-; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
+; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb:
+; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]]
+; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]]
+; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
+; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
+; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
; ALL: loadbb1:
-; ALL-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16*
-; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16*
-; ALL-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2
+; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16*
+; ALL-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16*
; ALL-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2
-; ALL-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]]
+; ALL-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2
; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]]
-; ALL-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]])
+; ALL-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]]
; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; ALL-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32
+; ALL-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
; ALL-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32
-; ALL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]]
-; ALL-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; ALL-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i32
+; ALL-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]]
+; ALL-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
; ALL: endblock:
-; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; ALL-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6)
@@ -153,34 +160,35 @@ define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp8(
-; X32-NEXT: loadbb:
-; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
-; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
-; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
-; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X32-NEXT: br label [[LOADBB:%.*]]
; X32: res_block:
-; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
-; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ]
-; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
-; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
+; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X32-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; X32-NEXT: br label [[ENDBLOCK:%.*]]
+; X32: loadbb:
+; X32-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; X32-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; X32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]]
+; X32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]]
+; X32-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
+; X32-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
+; X32-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
+; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
; X32: loadbb1:
-; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32*
-; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32*
-; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1
+; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32*
+; X32-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32*
; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1
-; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]]
+; X32-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]]
-; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]])
+; X32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]]
; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]]
-; X32-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X32-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
+; X32-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]]
+; X32-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
; X32: endblock:
-; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; X32-NEXT: ret i32 [[PHI_RES]]
;
; X64-LABEL: @cmp8(
@@ -207,30 +215,33 @@ define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: ret i32 [[CALL]]
;
; X64-LABEL: @cmp9(
-; X64-NEXT: loadbb:
-; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
-; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
-; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT: br label [[LOADBB:%.*]]
; X64: res_block:
-; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]]
-; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ]
+; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ]
+; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb:
+; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]]
+; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]]
+; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
+; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
+; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
+; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
; X64: loadbb1:
-; X64-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 8
-; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 8
-; X64-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]]
+; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8
+; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 8
; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]]
-; X64-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]]
; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X64-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
+; X64-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; X64-NEXT: br label [[ENDBLOCK]]
; X64: endblock:
-; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; X64-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9)
@@ -243,36 +254,37 @@ define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: ret i32 [[CALL]]
;
; X64-LABEL: @cmp10(
-; X64-NEXT: loadbb:
-; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
-; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
-; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT: br label [[LOADBB:%.*]]
; X64: res_block:
-; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
-; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ]
-; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
-; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb:
+; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]]
+; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]]
+; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
+; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
+; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
+; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
; X64: loadbb1:
-; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16*
-; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16*
-; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4
+; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16*
+; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16*
; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4
-; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]]
+; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 4
; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]]
-; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]])
+; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]]
; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64
+; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64
-; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]]
-; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64
+; X64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]]
+; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
; X64: endblock:
-; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; X64-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10)
@@ -294,36 +306,37 @@ define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: ret i32 [[CALL]]
;
; X64-LABEL: @cmp12(
-; X64-NEXT: loadbb:
-; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
-; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
-; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT: br label [[LOADBB:%.*]]
; X64: res_block:
-; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
-; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ]
-; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
-; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb:
+; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]]
+; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]]
+; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
+; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
+; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
+; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
; X64: loadbb1:
-; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32*
-; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32*
-; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2
+; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32*
+; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32*
; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2
-; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]]
+; X64-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 2
; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]]
-; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]])
+; X64-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]]
; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64
+; X64-NEXT: [[TMP17:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64
-; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]]
-; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT: [[TMP19]] = zext i32 [[TMP17]] to i64
+; X64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]]
+; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
; X64: endblock:
-; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; X64-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12)
@@ -363,34 +376,35 @@ define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: ret i32 [[CALL]]
;
; X64-LABEL: @cmp16(
-; X64-NEXT: loadbb:
-; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
-; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
-; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT: br label [[LOADBB:%.*]]
; X64: res_block:
-; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
-; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ]
-; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
-; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb:
+; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]]
+; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]]
+; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
+; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
+; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
+; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
; X64: loadbb1:
-; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i64*
-; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i64*
-; X64-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 1
+; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i64*
+; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i64*
; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1
-; X64-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP11]]
+; X64-NEXT: [[TMP13:%.*]] = getelementptr i64, i64* [[TMP11]], i64 1
; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]]
-; X64-NEXT: [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]])
+; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]]
; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP15]], [[TMP16]]
-; X64-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT: [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
+; X64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
+; X64-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
; X64: endblock:
-; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; X64-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)
@@ -417,22 +431,23 @@ define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq3(
-; ALL-NEXT: loadbb:
-; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16*
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16*
-; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]]
-; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
-; ALL-NEXT: [[TMP4:%.*]] = icmp ne i16 [[TMP2]], [[TMP3]]
-; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; ALL-NEXT: br label [[LOADBB:%.*]]
; ALL: res_block:
; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb:
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16*
+; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]]
+; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
+; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; ALL: loadbb1:
-; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 2
-; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 2
-; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]]
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 2
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 2
; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]]
-; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]]
-; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]]
+; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
+; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; ALL: endblock:
; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -465,22 +480,23 @@ define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq5(
-; ALL-NEXT: loadbb:
-; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]]
-; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; ALL-NEXT: br label [[LOADBB:%.*]]
; ALL: res_block:
; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb:
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
+; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
+; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; ALL: loadbb1:
-; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 4
-; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 4
-; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]]
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 4
; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]]
-; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]]
-; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]]
+; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
+; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; ALL: endblock:
; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -495,24 +511,25 @@ define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq6(
-; ALL-NEXT: loadbb:
-; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]]
-; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; ALL-NEXT: br label [[LOADBB:%.*]]
; ALL: res_block:
; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb:
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
+; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
+; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; ALL: loadbb1:
-; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16*
-; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16*
-; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2
+; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16*
+; ALL-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16*
; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2
-; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]]
+; ALL-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2
; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
-; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]]
-; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; ALL-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]]
+; ALL-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
+; ALL-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; ALL: endblock:
; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -540,24 +557,25 @@ define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp_eq8(
-; X32-NEXT: loadbb:
-; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]]
-; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X32-NEXT: br label [[LOADBB:%.*]]
; X32: res_block:
; X32-NEXT: br label [[ENDBLOCK:%.*]]
+; X32: loadbb:
+; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
+; X32-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
+; X32-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X32: loadbb1:
-; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32*
-; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32*
-; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1
+; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32*
+; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32*
; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1
-; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]]
+; X32-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 1
; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
-; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]]
-; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
+; X32-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
+; X32-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; X32: endblock:
; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -589,22 +607,23 @@ define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: ret i32 [[CONV]]
;
; X64-LABEL: @cmp_eq9(
-; X64-NEXT: loadbb:
-; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
-; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64-NEXT: br label [[LOADBB:%.*]]
; X64: res_block:
; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb:
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
+; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
+; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X64: loadbb1:
-; X64-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 8
-; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 8
-; X64-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]]
+; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8
+; X64-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 8
; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]]
-; X64-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]]
-; X64-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]]
+; X64-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
+; X64-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; X64: endblock:
; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -625,24 +644,25 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: ret i32 [[CONV]]
;
; X64-LABEL: @cmp_eq10(
-; X64-NEXT: loadbb:
-; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
-; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64-NEXT: br label [[LOADBB:%.*]]
; X64: res_block:
; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb:
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
+; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
+; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X64: loadbb1:
-; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16*
-; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16*
-; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4
+; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16*
+; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16*
; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4
-; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]]
+; X64-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4
; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
-; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]]
-; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]]
+; X64-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
+; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; X64: endblock:
; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -676,24 +696,25 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: ret i32 [[CONV]]
;
; X64-LABEL: @cmp_eq12(
-; X64-NEXT: loadbb:
-; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
-; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64-NEXT: br label [[LOADBB:%.*]]
; X64: res_block:
; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb:
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
+; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
+; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X64: loadbb1:
-; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32*
-; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32*
-; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2
+; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32*
+; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32*
; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2
-; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]]
+; X64-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2
; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
-; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]]
-; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
+; X64-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
+; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; X64: endblock:
; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
diff --git a/test/Transforms/IRCE/add-metadata-pre-post-loops.ll b/test/Transforms/IRCE/add-metadata-pre-post-loops.ll
index 488d4b479ba..0225af903ef 100644
--- a/test/Transforms/IRCE/add-metadata-pre-post-loops.ll
+++ b/test/Transforms/IRCE/add-metadata-pre-post-loops.ll
@@ -38,7 +38,7 @@ exit: ; preds = %in.bounds, %entry
define void @single_access_with_preloop(i32 *%arr, i32 *%a_len_ptr, i32 %n, i32 %offset) {
; CHECK-LABEL: @single_access_with_preloop(
; CHECK-LABEL: in.bounds.preloop
-; CHECK: br i1 %14, label %loop.preloop, label %preloop.exit.selector, !llvm.loop !8, !irce.loop.clone !7
+; CHECK: br i1 [[COND:%[^ ]+]], label %loop.preloop, label %preloop.exit.selector, !llvm.loop !8, !irce.loop.clone !7
; CHECK-LABEL: in.bounds.postloop
; CHECK: br i1 %next.postloop, label %loop.postloop, label %exit.loopexit.loopexit, !llvm.loop !9, !irce.loop.clone !7
entry:
diff --git a/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll b/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll
new file mode 100644
index 00000000000..dc6aae8d8aa
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll
@@ -0,0 +1,71 @@
+; RUN: opt %s -indvars -S -o - | FileCheck %s
+source_filename = "/Data/llvm/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.status = type { i32, i8* }
+
+@status = internal unnamed_addr global [32 x %struct.status] zeroinitializer, align 16, !dbg !0
+
+define void @f0() local_unnamed_addr !dbg !20 {
+entry:
+ tail call void @llvm.dbg.value(metadata i32 0, metadata !23, metadata !DIExpression()), !dbg !24
+ br label %for.cond, !dbg !24
+
+for.cond: ; preds = %for.body, %entry
+ ; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ ; CHECK: call void @llvm.dbg.value(metadata i64 %indvars.iv, metadata !23, metadata !DIExpression()), !dbg !24
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ tail call void @llvm.dbg.value(metadata i32 %i.0, metadata !23, metadata !DIExpression()), !dbg !24
+ %cmp = icmp slt i32 %i.0, 32, !dbg !24
+ br i1 %cmp, label %for.body, label %for.end, !dbg !24
+
+for.body: ; preds = %for.cond
+ %idxprom = sext i32 %i.0 to i64, !dbg !24
+ %value = getelementptr inbounds [32 x %struct.status], [32 x %struct.status]* @status, i64 0, i64 %idxprom, i32 0, !dbg !24
+ store i32 42, i32* %value, align 16, !dbg !24
+ tail call void @use(i32 %i.0), !dbg !24
+ %inc = add nsw i32 %i.0, 1, !dbg !24
+ tail call void @llvm.dbg.value(metadata i32 %inc, metadata !23, metadata !DIExpression()), !dbg !24
+ br label %for.cond, !dbg !24
+
+for.end: ; preds = %for.cond
+ ret void, !dbg !24
+}
+
+declare void @use(i32)
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #0
+
+attributes #0 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!16, !17, !18}
+!llvm.ident = !{!19}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "status", scope: !2, file: !3, line: 5, type: !6, isLocal: true, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 6.0.0 (trunk 316001) (llvm/trunk 316171)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "x.c", directory: "/home/davide/work/llvm/build-release/bin")
+!4 = !{}
+!5 = !{!0}
+!6 = !DICompositeType(tag: DW_TAG_array_type, baseType: !7, size: 4096, elements: !14)
+!7 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "status", file: !3, line: 2, size: 128, elements: !8)
+!8 = !{!9, !11}
+!9 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !7, file: !3, line: 3, baseType: !10, size: 32)
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !DIDerivedType(tag: DW_TAG_member, name: "p", scope: !7, file: !3, line: 4, baseType: !12, size: 64, offset: 64)
+!12 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !13, size: 64)
+!13 = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char)
+!14 = !{!15}
+!15 = !DISubrange(count: 32)
+!16 = !{i32 2, !"Dwarf Version", i32 4}
+!17 = !{i32 2, !"Debug Info Version", i32 3}
+!18 = !{i32 1, !"wchar_size", i32 4}
+!19 = !{!"clang version 6.0.0 (trunk 316001) (llvm/trunk 316171)"}
+!20 = distinct !DISubprogram(name: "f0", scope: !3, file: !3, line: 6, type: !21, isLocal: false, isDefinition: true, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: true, unit: !2, variables: !22)
+!21 = !DISubroutineType(types: !4)
+!22 = !{!23}
+!23 = !DILocalVariable(name: "i", scope: !20, file: !3, line: 8, type: !10)
+!24 = !DILocation(line: 9, scope: !20)
diff --git a/test/Transforms/InstCombine/debuginfo_add.ll b/test/Transforms/InstCombine/debuginfo_add.ll
new file mode 100644
index 00000000000..0d194cc65c7
--- /dev/null
+++ b/test/Transforms/InstCombine/debuginfo_add.ll
@@ -0,0 +1,108 @@
+; RUN: opt -instcombine %s -o - -S | FileCheck %s
+; typedef struct v *v_t;
+; struct v {
+; unsigned long long p;
+; };
+;
+; void f(v_t object, unsigned long long *start) {
+; unsigned head_size;
+; unsigned long long orig_start;
+; unsigned long long offset;
+; orig_start = *start;
+; for (offset = orig_start - (unsigned long long)(1 << 12); head_size;
+; offset -= (unsigned long long)(1 << 12), head_size -= (1 << 12))
+; use(offset, (object));
+; }
+source_filename = "test.i"
+target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7s-apple-ios5.0.0"
+
+%struct.vm_object = type { i64 }
+
+; Function Attrs: nounwind ssp
+define void @f(%struct.vm_object* %object, i64* nocapture readonly %start) local_unnamed_addr #0 !dbg !11 {
+entry:
+ tail call void @llvm.dbg.value(metadata %struct.vm_object* %object, metadata !21, metadata !DIExpression()), !dbg !27
+ tail call void @llvm.dbg.value(metadata i64* %start, metadata !22, metadata !DIExpression()), !dbg !28
+ %0 = load i64, i64* %start, align 4, !dbg !29
+ tail call void @llvm.dbg.value(metadata i64 %0, metadata !25, metadata !DIExpression()), !dbg !30
+ %offset.08 = add i64 %0, -4096
+ tail call void @llvm.dbg.value(metadata i64 %offset.08, metadata !26, metadata !DIExpression()), !dbg !31
+ ; CHECK: call void @llvm.dbg.value(metadata i64 %0, metadata !26, metadata !DIExpression(DW_OP_constu, 4096, DW_OP_minus, DW_OP_stack_value)), !dbg !30
+ tail call void @llvm.dbg.value(metadata i32 undef, metadata !23, metadata !DIExpression()), !dbg !32
+ br i1 undef, label %for.end, label %for.body.lr.ph, !dbg !32
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body, !dbg !32
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %offset.010 = phi i64 [ %offset.08, %for.body.lr.ph ], [ %offset.0, %for.body ]
+ %head_size.09 = phi i32 [ undef, %for.body.lr.ph ], [ %sub2, %for.body ]
+ tail call void @llvm.dbg.value(metadata i32 %head_size.09, metadata !23, metadata !DIExpression()), !dbg !31
+ %call = tail call i32 bitcast (i32 (...)* @use to i32 (i64, %struct.vm_object*)*)(i64 %offset.010, %struct.vm_object* %object) #3, !dbg !34
+ %sub2 = add i32 %head_size.09, -4096, !dbg !37
+ %offset.0 = add i64 %offset.010, -4096
+ tail call void @llvm.dbg.value(metadata i64 %offset.0, metadata !26, metadata !DIExpression()), !dbg !30
+ ; CHECK: call void @llvm.dbg.value(metadata i64 %offset.010, metadata !26, metadata !DIExpression(DW_OP_constu, 4096, DW_OP_minus, DW_OP_stack_value)), !dbg !29
+ tail call void @llvm.dbg.value(metadata i32 %sub2, metadata !23, metadata !DIExpression()), !dbg !31
+ %tobool = icmp eq i32 %sub2, 0, !dbg !32
+ br i1 %tobool, label %for.end, label %for.body, !dbg !32, !llvm.loop !38
+
+for.end: ; preds = %for.body, %entry
+ ret void, !dbg !40
+}
+
+declare i32 @use(...) local_unnamed_addr
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #2
+
+attributes #0 = { nounwind ssp }
+attributes #2 = { nounwind readnone speculatable }
+attributes #3 = { nobuiltin }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!5, !6, !7, !8, !9}
+!llvm.ident = !{!10}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0 (trunk 317434) (llvm/trunk 317437)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3)
+!1 = !DIFile(filename: "test.i", directory: "/Data/radar/31209283")
+!2 = !{}
+!3 = !{!4}
+!4 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!5 = !{i32 2, !"Dwarf Version", i32 2}
+!6 = !{i32 2, !"Debug Info Version", i32 3}
+!7 = !{i32 1, !"wchar_size", i32 4}
+!8 = !{i32 1, !"min_enum_size", i32 4}
+!9 = !{i32 7, !"PIC Level", i32 2}
+!10 = !{!"clang version 6.0.0 (trunk 317434) (llvm/trunk 317437)"}
+!11 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 6, type: !12, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !20)
+!12 = !DISubroutineType(types: !13)
+!13 = !{null, !14, !19}
+!14 = !DIDerivedType(tag: DW_TAG_typedef, name: "v_t", file: !1, line: 1, baseType: !15)
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 32)
+!16 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "v", file: !1, line: 2, size: 64, elements: !17)
+!17 = !{!18}
+!18 = !DIDerivedType(tag: DW_TAG_member, name: "p", scope: !16, file: !1, line: 3, baseType: !4, size: 64)
+!19 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 32)
+!20 = !{!21, !22, !23, !25, !26}
+!21 = !DILocalVariable(name: "object", arg: 1, scope: !11, file: !1, line: 6, type: !14)
+!22 = !DILocalVariable(name: "start", arg: 2, scope: !11, file: !1, line: 6, type: !19)
+!23 = !DILocalVariable(name: "head_size", scope: !11, file: !1, line: 7, type: !24)
+!24 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!25 = !DILocalVariable(name: "orig_start", scope: !11, file: !1, line: 8, type: !4)
+!26 = !DILocalVariable(name: "offset", scope: !11, file: !1, line: 9, type: !4)
+!27 = !DILocation(line: 6, column: 20, scope: !11)
+!28 = !DILocation(line: 6, column: 48, scope: !11)
+!29 = !DILocation(line: 8, column: 22, scope: !11)
+!30 = !DILocation(line: 7, column: 12, scope: !11)
+!31 = !DILocation(line: 10, column: 16, scope: !11)
+!32 = !DILocation(line: 11, column: 5, scope: !33)
+!33 = distinct !DILexicalBlock(scope: !11, file: !1, line: 11, column: 5)
+!34 = !DILocation(line: 13, column: 7, scope: !35)
+!35 = distinct !DILexicalBlock(scope: !36, file: !1, line: 12, column: 75)
+!36 = distinct !DILexicalBlock(scope: !33, file: !1, line: 11, column: 5)
+!37 = !DILocation(line: 12, column: 61, scope: !36)
+!38 = distinct !{!38, !32, !39}
+!39 = !DILocation(line: 14, column: 3, scope: !33)
+!40 = !DILocation(line: 15, column: 1, scope: !11)
diff --git a/test/Transforms/InstCombine/shift.ll b/test/Transforms/InstCombine/shift.ll
index cbb3d614db2..ba52023e0db 100644
--- a/test/Transforms/InstCombine/shift.ll
+++ b/test/Transforms/InstCombine/shift.ll
@@ -1332,3 +1332,263 @@ define i7 @test65(i7 %a, i7 %b) {
%y = and i7 %x, 1 ; this extracts the lsb which should be 0 because we shifted an even number of bits and all even bits of the shift input are 0.
ret i7 %y
}
+
+define i32 @shl_select_add_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_add_true(
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = add i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = shl i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @shl_select_add_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_add_false(
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = add i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = shl i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @shl_select_and_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_and_true(
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = and i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = shl i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @shl_select_and_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_and_false(
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = and i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = shl i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @lshr_select_and_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_and_true(
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = and i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = lshr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @lshr_select_and_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_and_false(
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = and i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = lshr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @ashr_select_and_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_and_true(
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], -1073741821
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = and i32 %x, 2147483655
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = ashr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @ashr_select_and_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_and_false(
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], -1073741821
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = and i32 %x, 2147483655
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = ashr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @shl_select_or_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_or_true(
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = or i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = shl i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @shl_select_or_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_or_false(
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = or i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = shl i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @lshr_select_or_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_or_true(
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = or i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = lshr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @lshr_select_or_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_or_false(
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = or i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = lshr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @ashr_select_or_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_or_true(
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = or i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = ashr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @ashr_select_or_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_or_false(
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = or i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = ashr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @shl_select_xor_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_xor_true(
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = xor i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = shl i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @shl_select_xor_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_xor_false(
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = xor i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = shl i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @lshr_select_xor_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_xor_true(
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = xor i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = lshr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @lshr_select_xor_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_xor_false(
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = xor i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = lshr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @ashr_select_xor_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_xor_true(
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = xor i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = ashr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @ashr_select_xor_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_xor_false(
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = xor i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = ashr i32 %2, 1
+ ret i32 %3
+}
diff --git a/test/Transforms/LICM/sinking.ll b/test/Transforms/LICM/sinking.ll
index 6e9e8d4b7b6..b28eea0bc2a 100644
--- a/test/Transforms/LICM/sinking.ll
+++ b/test/Transforms/LICM/sinking.ll
@@ -392,6 +392,288 @@ lab60:
indirectbr i8* undef, [label %lab21, label %lab19]
}
-declare void @f(i32*)
+; Check if LICM can sink a sinkable instruction the exit blocks through
+; a non-trivially replacable PHI node.
+;
+; CHECK-LABEL: @test14
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+; CHECK-NOT: sub
+;
+; CHECK-LABEL: Out12.split.loop.exit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop ]
+; CHECK: %[[MUL:.*]] = mul i32 %N, %[[LCSSAPHI]]
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12.split.loop.exit1:
+; CHECK: %[[LCSSAPHI2:.*]] = phi i32 [ %N_addr.0.pn, %Loop ]
+; CHECK: %[[MUL2:.*]] = mul i32 %N, %[[LCSSAPHI2]]
+; CHECK: %[[SUB:.*]] = sub i32 %[[MUL2]], %N
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12:
+; CHECK: phi i32 [ %[[MUL]], %Out12.split.loop.exit ], [ %[[SUB]], %Out12.split.loop.exit1 ]
+define i32 @test14(i32 %N, i32 %N2, i1 %C) {
+Entry:
+ br label %Loop
+Loop:
+ %N_addr.0.pn = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ]
+ %sink.mul = mul i32 %N, %N_addr.0.pn
+ %sink.sub = sub i32 %sink.mul, %N
+ %dec = add i32 %N_addr.0.pn, -1
+ br i1 %C, label %ContLoop, label %Out12
+ContLoop:
+ %tmp.1 = icmp ne i32 %N_addr.0.pn, 1
+ br i1 %tmp.1, label %Loop, label %Out12
+Out12:
+ %tmp = phi i32 [%sink.mul, %ContLoop], [%sink.sub, %Loop]
+ ret i32 %tmp
+}
+
+; In this test, splitting predecessors is not really required because the
+; operations of sinkable instructions (sub and mul) are same. In this case, we
+; can sink the same sinkable operations and modify the PHI to pass the operands
+; to the shared operations. As of now, we split predecessors of non-trivially
+; replicalbe PHIs by default in LICM because all incoming edges of a
+; non-trivially replacable PHI in LCSSA is critical.
+;
+; CHECK-LABEL: @test15
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+; CHECK-NOT: sub
+;
+; CHECK-LABEL: Out12.split.loop.exit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop ]
+; CHECK: %[[MUL:.*]] = mul i32 %N, %[[LCSSAPHI]]
+; CHECK: %[[SUB:.*]] = sub i32 %[[MUL]], %N2
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12.split.loop.exit1:
+; CHECK: %[[LCSSAPHI2:.*]] = phi i32 [ %N_addr.0.pn, %Loop ]
+; CHECK: %[[MUL2:.*]] = mul i32 %N, %[[LCSSAPHI2]]
+; CHECK: %[[SUB2:.*]] = sub i32 %[[MUL2]], %N
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12:
+; CHECK: phi i32 [ %[[SUB]], %Out12.split.loop.exit ], [ %[[SUB2]], %Out12.split.loop.exit1 ]
+define i32 @test15(i32 %N, i32 %N2, i1 %C) {
+Entry:
+ br label %Loop
+Loop:
+ %N_addr.0.pn = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ]
+ %sink.mul = mul i32 %N, %N_addr.0.pn
+ %sink.sub = sub i32 %sink.mul, %N
+ %sink.sub2 = sub i32 %sink.mul, %N2
+ %dec = add i32 %N_addr.0.pn, -1
+ br i1 %C, label %ContLoop, label %Out12
+ContLoop:
+ %tmp.1 = icmp ne i32 %N_addr.0.pn, 1
+ br i1 %tmp.1, label %Loop, label %Out12
+Out12:
+ %tmp = phi i32 [%sink.sub2, %ContLoop], [%sink.sub, %Loop]
+ ret i32 %tmp
+}
+
+; Sink through a non-trivially replacable PHI node which use the same sinkable
+; instruction multiple times.
+;
+; CHECK-LABEL: @test16
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+;
+; CHECK-LABEL: Out.split.loop.exit:
+; CHECK: %[[PHI:.*]] = phi i32 [ %l2, %ContLoop ]
+; CHECK: br label %Out
+;
+; CHECK-LABEL: Out.split.loop.exit1:
+; CHECK: %[[SINKABLE:.*]] = mul i32 %l2.lcssa, %t.le
+; CHECK: br label %Out
+;
+; CHECK-LABEL: Out:
+; CHECK: %idx = phi i32 [ %[[PHI]], %Out.split.loop.exit ], [ %[[SINKABLE]], %Out.split.loop.exit1 ]
+define i32 @test16(i1 %c, i8** %P, i32* %P2, i64 %V) {
+entry:
+ br label %loop.ph
+loop.ph:
+ br label %Loop
+Loop:
+ %iv = phi i64 [ 0, %loop.ph ], [ %next, %ContLoop ]
+ %l2 = call i32 @getv()
+ %t = trunc i64 %iv to i32
+ %sinkable = mul i32 %l2, %t
+ switch i32 %l2, label %ContLoop [
+ i32 32, label %Out
+ i32 46, label %Out
+ i32 95, label %Out
+ ]
+ContLoop:
+ %next = add nuw i64 %iv, 1
+ %c1 = call i1 @getc()
+ br i1 %c1, label %Loop, label %Out
+Out:
+ %idx = phi i32 [ %l2, %ContLoop ], [ %sinkable, %Loop ], [ %sinkable, %Loop ], [ %sinkable, %Loop ]
+ ret i32 %idx
+}
+
+; Sink a sinkable instruction through multiple non-trivially replacable PHIs in
+; differect exit blocks.
+;
+; CHECK-LABEL: @test17
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+;
+; CHECK-LABEL:OutA.split.loop.exit{{.*}}:
+; CHECK: %[[OP1:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop1 ]
+; CHECK: %[[SINKABLE:.*]] = mul i32 %N, %[[OP1]]
+; CHECK: br label %OutA
+;
+; CHECK-LABEL:OutA:
+; CHECK: phi i32{{.*}}[ %[[SINKABLE]], %OutA.split.loop.exit{{.*}} ]
+;
+; CHECK-LABEL:OutB.split.loop.exit{{.*}}:
+; CHECK: %[[OP2:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop2 ]
+; CHECK: %[[SINKABLE2:.*]] = mul i32 %N, %[[OP2]]
+; CHECK: br label %OutB
+;
+; CHECK-LABEL:OutB:
+; CHECK: phi i32 {{.*}}[ %[[SINKABLE2]], %OutB.split.loop.exit{{.*}} ]
+define i32 @test17(i32 %N, i32 %N2) {
+Entry:
+ br label %Loop
+Loop:
+ %N_addr.0.pn = phi i32 [ %dec, %ContLoop3 ], [ %N, %Entry ]
+ %sink.mul = mul i32 %N, %N_addr.0.pn
+ %c0 = call i1 @getc()
+ br i1 %c0 , label %ContLoop1, label %OutA
+ContLoop1:
+ %c1 = call i1 @getc()
+ br i1 %c1, label %ContLoop2, label %OutA
+
+ContLoop2:
+ %c2 = call i1 @getc()
+ br i1 %c2, label %ContLoop3, label %OutB
+ContLoop3:
+ %c3 = call i1 @getc()
+ %dec = add i32 %N_addr.0.pn, -1
+ br i1 %c3, label %Loop, label %OutB
+OutA:
+ %tmp1 = phi i32 [%sink.mul, %ContLoop1], [%N2, %Loop]
+ br label %Out12
+OutB:
+ %tmp2 = phi i32 [%sink.mul, %ContLoop2], [%dec, %ContLoop3]
+ br label %Out12
+Out12:
+ %tmp = phi i32 [%tmp1, %OutA], [%tmp2, %OutB]
+ ret i32 %tmp
+}
+
+
+; Sink a sinkable instruction through both trivially and non-trivially replacable PHIs.
+;
+; CHECK-LABEL: @test18
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+; CHECK-NOT: sub
+;
+; CHECK-LABEL:Out12.split.loop.exit:
+; CHECK: %[[OP:.*]] = phi i32 [ %iv, %ContLoop ]
+; CHECK: %[[DEC:.*]] = phi i32 [ %dec, %ContLoop ]
+; CHECK: %[[SINKMUL:.*]] = mul i32 %N, %[[OP]]
+; CHECK: %[[SINKSUB:.*]] = sub i32 %[[SINKMUL]], %N2
+; CHECK: br label %Out12
+;
+; CHECK-LABEL:Out12.split.loop.exit1:
+; CHECK: %[[OP2:.*]] = phi i32 [ %iv, %Loop ]
+; CHECK: %[[SINKMUL2:.*]] = mul i32 %N, %[[OP2]]
+; CHECK: %[[SINKSUB2:.*]] = sub i32 %[[SINKMUL2]], %N2
+; CHECK: br label %Out12
+;
+; CHECK-LABEL:Out12:
+; CHECK: %tmp1 = phi i32 [ %[[SINKSUB]], %Out12.split.loop.exit ], [ %[[SINKSUB2]], %Out12.split.loop.exit1 ]
+; CHECK: %tmp2 = phi i32 [ %[[DEC]], %Out12.split.loop.exit ], [ %[[SINKSUB2]], %Out12.split.loop.exit1 ]
+; CHECK: %add = add i32 %tmp1, %tmp2
+define i32 @test18(i32 %N, i32 %N2) {
+Entry:
+ br label %Loop
+Loop:
+ %iv = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ]
+ %sink.mul = mul i32 %N, %iv
+ %sink.sub = sub i32 %sink.mul, %N2
+ %c0 = call i1 @getc()
+ br i1 %c0, label %ContLoop, label %Out12
+ContLoop:
+ %dec = add i32 %iv, -1
+ %c1 = call i1 @getc()
+ br i1 %c1, label %Loop, label %Out12
+Out12:
+ %tmp1 = phi i32 [%sink.sub, %ContLoop], [%sink.sub, %Loop]
+ %tmp2 = phi i32 [%dec, %ContLoop], [%sink.sub, %Loop]
+ %add = add i32 %tmp1, %tmp2
+ ret i32 %add
+}
+
+; Do not sink an instruction through a non-trivially replacable PHI, to avoid
+; assert while splitting predecessors, if the terminator of predecessor is an
+; indirectbr.
+; CHECK-LABEL: @test19
+; CHECK-LABEL: L0:
+; CHECK: %sinkable = mul
+; CHECK: %sinkable2 = add
+
+define i32 @test19(i1 %cond, i1 %cond2, i8* %address, i32 %v1) nounwind {
+entry:
+ br label %L0
+L0:
+ %indirect.goto.dest = select i1 %cond, i8* blockaddress(@test19, %exit), i8* %address
+ %v2 = call i32 @getv()
+ %sinkable = mul i32 %v1, %v2
+ %sinkable2 = add i32 %v1, %v2
+ indirectbr i8* %indirect.goto.dest, [label %L1, label %exit]
+
+L1:
+ %indirect.goto.dest2 = select i1 %cond2, i8* blockaddress(@test19, %exit), i8* %address
+ indirectbr i8* %indirect.goto.dest2, [label %L0, label %exit]
+
+exit:
+ %r = phi i32 [%sinkable, %L0], [%sinkable2, %L1]
+ ret i32 %r
+}
+
+; Do not sink through a non-trivially replacable PHI if splitting predecessors
+; not allowed in SplitBlockPredecessors().
+;
+; CHECK-LABEL: @test20
+; CHECK-LABEL: while.cond
+; CHECK: %sinkable = mul
+; CHECK: %sinkable2 = add
+define void @test20(i32* %s, i1 %b, i32 %v1, i32 %v2) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ br label %while.cond
+while.cond:
+ %v = call i32 @getv()
+ %sinkable = mul i32 %v, %v2
+ %sinkable2 = add i32 %v, %v2
+ br i1 %b, label %try.cont, label %while.body
+while.body:
+ invoke void @may_throw()
+ to label %while.body2 unwind label %catch.dispatch
+while.body2:
+ invoke void @may_throw2()
+ to label %while.cond unwind label %catch.dispatch
+catch.dispatch:
+ %.lcssa1 = phi i32 [ %sinkable, %while.body ], [ %sinkable2, %while.body2 ]
+ %cp = cleanuppad within none []
+ store i32 %.lcssa1, i32* %s
+ cleanupret from %cp unwind to caller
+try.cont:
+ ret void
+}
+
+declare void @may_throw()
+declare void @may_throw2()
+declare i32 @__CxxFrameHandler3(...)
+declare i32 @getv()
+declare i1 @getc()
+declare void @f(i32*)
declare void @g()
diff --git a/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll b/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
new file mode 100644
index 00000000000..3c283dcb6e5
--- /dev/null
+++ b/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
@@ -0,0 +1,46 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S < %s | \
+; RUN: FileCheck %s
+;
+; The GPU Load & Store Vectorizer may merge differently-typed accesses into a
+; single instruction. This test checks that we merge TBAA tags for such
+; accesses correctly.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; struct S {
+; float f;
+; int i;
+; };
+%struct.S = type { float, i32 }
+
+; float foo(S *p) {
+; p->f -= 1;
+; p->i -= 1;
+; return p->f;
+; }
+define float @foo(%struct.S* %p) {
+entry:
+; CHECK-LABEL: foo
+; CHECK: load <2 x i32>, {{.*}}, !tbaa [[TAG_char:!.*]]
+; CHECK: store <2 x i32> {{.*}}, !tbaa [[TAG_char]]
+ %f = getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 0
+ %0 = load float, float* %f, align 4, !tbaa !2
+ %sub = fadd float %0, -1.000000e+00
+ store float %sub, float* %f, align 4, !tbaa !2
+ %i = getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 1
+ %1 = load i32, i32* %i, align 4, !tbaa !8
+ %sub1 = add nsw i32 %1, -1
+ store i32 %sub1, i32* %i, align 4, !tbaa !8
+ ret float %sub
+}
+
+!2 = !{!3, !4, i64 0}
+!3 = !{!"_ZTS1S", !4, i64 0, !7, i64 4}
+!4 = !{!"float", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C++ TBAA"}
+!7 = !{!"int", !5, i64 0}
+!8 = !{!3, !7, i64 4}
+
+; CHECK-DAG: [[TYPE_char:!.*]] = !{!"omnipotent char", {{.*}}, i64 0}
+; CHECK-FAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0}
diff --git a/test/Transforms/LoopPredication/widened.ll b/test/Transforms/LoopPredication/widened.ll
new file mode 100644
index 00000000000..33c4e270613
--- /dev/null
+++ b/test/Transforms/LoopPredication/widened.ll
@@ -0,0 +1,138 @@
+; RUN: opt -S -loop-predication -loop-predication-enable-iv-truncation=true < %s 2>&1 | FileCheck %s
+declare void @llvm.experimental.guard(i1, ...)
+
+declare i32 @length(i8*)
+
+declare i16 @short_length(i8*)
+; Consider range check of type i16 and i32, while IV is of type i64
+; We can loop predicate this because the IV range is within i16 and within i32.
+define i64 @iv_wider_type_rc_two_narrow_types(i32 %offA, i16 %offB, i8* %arrA, i8* %arrB) {
+; CHECK-LABEL: iv_wider_type_rc_two_narrow_types
+entry:
+; CHECK-LABEL: entry:
+; CHECK: [[idxB:[^ ]+]] = sub i16 %lengthB, %offB
+; CHECK-NEXT: [[limit_checkB:[^ ]+]] = icmp ule i16 16, [[idxB]]
+; CHECK-NEXT: [[first_iteration_checkB:[^ ]+]] = icmp ult i16 %offB, %lengthB
+; CHECK-NEXT: [[WideChkB:[^ ]+]] = and i1 [[first_iteration_checkB]], [[limit_checkB]]
+; CHECK-NEXT: [[idxA:[^ ]+]] = sub i32 %lengthA, %offA
+; CHECK-NEXT: [[limit_checkA:[^ ]+]] = icmp ule i32 16, [[idxA]]
+; CHECK-NEXT: [[first_iteration_checkA:[^ ]+]] = icmp ult i32 %offA, %lengthA
+; CHECK-NEXT: [[WideChkA:[^ ]+]] = and i1 [[first_iteration_checkA]], [[limit_checkA]]
+ %lengthA = call i32 @length(i8* %arrA)
+ %lengthB = call i16 @short_length(i8* %arrB)
+ br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK: [[invariant_check:[^ ]+]] = and i1 [[WideChkB]], [[WideChkA]]
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[invariant_check]], i32 9)
+ %iv = phi i64 [0, %entry ], [ %iv.next, %loop ]
+ %iv.trunc.32 = trunc i64 %iv to i32
+ %iv.trunc.16 = trunc i64 %iv to i16
+ %indexA = add i32 %iv.trunc.32, %offA
+ %indexB = add i16 %iv.trunc.16, %offB
+ %rcA = icmp ult i32 %indexA, %lengthA
+ %rcB = icmp ult i16 %indexB, %lengthB
+ %wide.chk = and i1 %rcA, %rcB
+ call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk, i32 9) [ "deopt"() ]
+ %indexA.ext = zext i32 %indexA to i64
+ %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext
+ %eltA = load i8, i8* %addrA
+ %indexB.ext = zext i16 %indexB to i64
+ %addrB = getelementptr inbounds i8, i8* %arrB, i64 %indexB.ext
+ store i8 %eltA, i8* %addrB
+ %iv.next = add nuw nsw i64 %iv, 1
+ %latch.check = icmp ult i64 %iv.next, 16
+ br i1 %latch.check, label %loop, label %exit
+
+exit:
+ ret i64 %iv
+}
+
+
+; Consider an IV of type long and an array access into int array.
+; IV is of type i64 while the range check operands are of type i32 and i64.
+define i64 @iv_rc_different_types(i32 %offA, i32 %offB, i8* %arrA, i8* %arrB, i64 %max)
+{
+; CHECK-LABEL: iv_rc_different_types
+entry:
+; CHECK-LABEL: entry:
+; CHECK: [[lenB:[^ ]+]] = add i32 %lengthB, -1
+; CHECK-NEXT: [[idxB:[^ ]+]] = sub i32 [[lenB]], %offB
+; CHECK-NEXT: [[limit_checkB:[^ ]+]] = icmp ule i32 15, [[idxB]]
+; CHECK-NEXT: [[first_iteration_checkB:[^ ]+]] = icmp ult i32 %offB, %lengthB
+; CHECK-NEXT: [[WideChkB:[^ ]+]] = and i1 [[first_iteration_checkB]], [[limit_checkB]]
+; CHECK-NEXT: [[maxMinusOne:[^ ]+]] = add i64 %max, -1
+; CHECK-NEXT: [[limit_checkMax:[^ ]+]] = icmp ule i64 15, [[maxMinusOne]]
+; CHECK-NEXT: [[first_iteration_checkMax:[^ ]+]] = icmp ult i64 0, %max
+; CHECK-NEXT: [[WideChkMax:[^ ]+]] = and i1 [[first_iteration_checkMax]], [[limit_checkMax]]
+; CHECK-NEXT: [[lenA:[^ ]+]] = add i32 %lengthA, -1
+; CHECK-NEXT: [[idxA:[^ ]+]] = sub i32 [[lenA]], %offA
+; CHECK-NEXT: [[limit_checkA:[^ ]+]] = icmp ule i32 15, [[idxA]]
+; CHECK-NEXT: [[first_iteration_checkA:[^ ]+]] = icmp ult i32 %offA, %lengthA
+; CHECK-NEXT: [[WideChkA:[^ ]+]] = and i1 [[first_iteration_checkA]], [[limit_checkA]]
+ %lengthA = call i32 @length(i8* %arrA)
+ %lengthB = call i32 @length(i8* %arrB)
+ br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK: [[BandMax:[^ ]+]] = and i1 [[WideChkB]], [[WideChkMax]]
+; CHECK: [[ABandMax:[^ ]+]] = and i1 [[BandMax]], [[WideChkA]]
+; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[ABandMax]], i32 9)
+ %iv = phi i64 [0, %entry ], [ %iv.next, %loop ]
+ %iv.trunc = trunc i64 %iv to i32
+ %indexA = add i32 %iv.trunc, %offA
+ %indexB = add i32 %iv.trunc, %offB
+ %rcA = icmp ult i32 %indexA, %lengthA
+ %rcIV = icmp ult i64 %iv, %max
+ %wide.chk = and i1 %rcA, %rcIV
+ %rcB = icmp ult i32 %indexB, %lengthB
+ %wide.chk.final = and i1 %wide.chk, %rcB
+ call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk.final, i32 9) [ "deopt"() ]
+ %indexA.ext = zext i32 %indexA to i64
+ %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext
+ %eltA = load i8, i8* %addrA
+ %indexB.ext = zext i32 %indexB to i64
+ %addrB = getelementptr inbounds i8, i8* %arrB, i64 %indexB.ext
+ %eltB = load i8, i8* %addrB
+ %result = xor i8 %eltA, %eltB
+ store i8 %result, i8* %addrA
+ %iv.next = add nuw nsw i64 %iv, 1
+ %latch.check = icmp ult i64 %iv, 15
+ br i1 %latch.check, label %loop, label %exit
+
+exit:
+ ret i64 %iv
+}
+
+; cannot narrow the IV to the range type, because we lose information.
+; for (i64 i= 5; i>= 2; i++)
+; this loop wraps around after reaching 2^64.
+define i64 @iv_rc_different_type(i32 %offA, i8* %arrA) {
+; CHECK-LABEL: iv_rc_different_type
+entry:
+ %lengthA = call i32 @length(i8* %arrA)
+ br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK: %rcA = icmp ult i32 %indexA, %lengthA
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %rcA, i32 9)
+ %iv = phi i64 [ 5, %entry ], [ %iv.next, %loop ]
+ %iv.trunc.32 = trunc i64 %iv to i32
+ %indexA = add i32 %iv.trunc.32, %offA
+ %rcA = icmp ult i32 %indexA, %lengthA
+ call void (i1, ...) @llvm.experimental.guard(i1 %rcA, i32 9) [ "deopt"() ]
+ %indexA.ext = zext i32 %indexA to i64
+ %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext
+ %eltA = load i8, i8* %addrA
+ %res = add i8 %eltA, 2
+ store i8 %eltA, i8* %addrA
+ %iv.next = add i64 %iv, 1
+ %latch.check = icmp sge i64 %iv.next, 2
+ br i1 %latch.check, label %loop, label %exit
+
+exit:
+ ret i64 %iv
+}
diff --git a/test/Transforms/LoopVectorize/pr34681.ll b/test/Transforms/LoopVectorize/pr34681.ll
new file mode 100644
index 00000000000..e93265e2ed5
--- /dev/null
+++ b/test/Transforms/LoopVectorize/pr34681.ll
@@ -0,0 +1,122 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check the scenario where we have an unknown Stride, which happens to also be
+; the loop iteration count, so if we specialize the loop for the Stride==1 case,
+; this also implies that the loop will iterate no more than a single iteration,
+; as in the following example:
+;
+; unsigned int N;
+; int tmp = 0;
+; for(unsigned int k=0;k<N;k++) {
+; tmp+=(int)B[k*N+j];
+; }
+;
+; We check here that the following runtime scev guard for Stride==1 is NOT generated:
+; vector.scevcheck:
+; %ident.check = icmp ne i32 %N, 1
+; %0 = or i1 false, %ident.check
+; br i1 %0, label %scalar.ph, label %vector.ph
+; Instead the loop is vectorized with an unknown stride.
+
+; CHECK-LABEL: @foo1
+; CHECK: for.body.lr.ph
+; CHECK-NOT: %ident.check = icmp ne i32 %N, 1
+; CHECK-NOT: %[[TEST:[0-9]+]] = or i1 false, %ident.check
+; CHECK-NOT: br i1 %[[TEST]], label %scalar.ph, label %vector.ph
+; CHECK: vector.ph
+; CHECK: vector.body
+; CHECK: <4 x i32>
+; CHECK: middle.block
+; CHECK: scalar.ph
+
+
+define i32 @foo1(i32 %N, i16* nocapture readnone %A, i16* nocapture readonly %B, i32 %i, i32 %j) {
+entry:
+ %cmp8 = icmp eq i32 %N, 0
+ br i1 %cmp8, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+ br label %for.body
+
+for.body:
+ %tmp.010 = phi i32 [ 0, %for.body.lr.ph ], [ %add1, %for.body ]
+ %k.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %mul = mul i32 %k.09, %N
+ %add = add i32 %mul, %j
+ %arrayidx = getelementptr inbounds i16, i16* %B, i32 %add
+ %0 = load i16, i16* %arrayidx, align 2
+ %conv = sext i16 %0 to i32
+ %add1 = add nsw i32 %tmp.010, %conv
+ %inc = add nuw i32 %k.09, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ %add1.lcssa = phi i32 [ %add1, %for.body ]
+ br label %for.end
+
+for.end:
+ %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add1.lcssa, %for.end.loopexit ]
+ ret i32 %tmp.0.lcssa
+}
+
+
+; Check the same, but also where the Stride and the loop iteration count
+; are not of the same data type.
+;
+; unsigned short N;
+; int tmp = 0;
+; for(unsigned int k=0;k<N;k++) {
+; tmp+=(int)B[k*N+j];
+; }
+;
+; We check here that the following runtime scev guard for Stride==1 is NOT generated:
+; vector.scevcheck:
+; %ident.check = icmp ne i16 %N, 1
+; %0 = or i1 false, %ident.check
+; br i1 %0, label %scalar.ph, label %vector.ph
+
+
+; CHECK-LABEL: @foo2
+; CHECK: for.body.lr.ph
+; CHECK-NOT: %ident.check = icmp ne i16 %N, 1
+; CHECK-NOT: %[[TEST:[0-9]+]] = or i1 false, %ident.check
+; CHECK-NOT: br i1 %[[TEST]], label %scalar.ph, label %vector.ph
+; CHECK: vector.ph
+; CHECK: vector.body
+; CHECK: <4 x i32>
+; CHECK: middle.block
+; CHECK: scalar.ph
+
+define i32 @foo2(i16 zeroext %N, i16* nocapture readnone %A, i16* nocapture readonly %B, i32 %i, i32 %j) {
+entry:
+ %conv = zext i16 %N to i32
+ %cmp11 = icmp eq i16 %N, 0
+ br i1 %cmp11, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+ br label %for.body
+
+for.body:
+ %tmp.013 = phi i32 [ 0, %for.body.lr.ph ], [ %add4, %for.body ]
+ %k.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %mul = mul nuw i32 %k.012, %conv
+ %add = add i32 %mul, %j
+ %arrayidx = getelementptr inbounds i16, i16* %B, i32 %add
+ %0 = load i16, i16* %arrayidx, align 2
+ %conv3 = sext i16 %0 to i32
+ %add4 = add nsw i32 %tmp.013, %conv3
+ %inc = add nuw nsw i32 %k.012, 1
+ %exitcond = icmp eq i32 %inc, %conv
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ %add4.lcssa = phi i32 [ %add4, %for.body ]
+ br label %for.end
+
+for.end:
+ %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add4.lcssa, %for.end.loopexit ]
+ ret i32 %tmp.0.lcssa
+}
diff --git a/test/Transforms/LoopVectorize/version-mem-access.ll b/test/Transforms/LoopVectorize/version-mem-access.ll
index a9d319e5a2d..774b6f26859 100644
--- a/test/Transforms/LoopVectorize/version-mem-access.ll
+++ b/test/Transforms/LoopVectorize/version-mem-access.ll
@@ -65,7 +65,8 @@ for.end:
define void @fn1(double* noalias %x, double* noalias %c, double %a) {
entry:
%conv = fptosi double %a to i32
- %cmp8 = icmp sgt i32 %conv, 0
+ %conv2 = add i32 %conv, 4
+ %cmp8 = icmp sgt i32 %conv2, 0
br i1 %cmp8, label %for.body.preheader, label %for.end
for.body.preheader:
@@ -82,7 +83,7 @@ for.body:
store double %1, double* %arrayidx3, align 8
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
- %exitcond = icmp eq i32 %lftr.wideiv, %conv
+ %exitcond = icmp eq i32 %lftr.wideiv, %conv2
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit:
diff --git a/test/Transforms/LowerTypeTests/blockaddress.ll b/test/Transforms/LowerTypeTests/blockaddress.ll
new file mode 100644
index 00000000000..ecc4814cfd5
--- /dev/null
+++ b/test/Transforms/LowerTypeTests/blockaddress.ll
@@ -0,0 +1,27 @@
+; RUN: opt -S %s -lowertypetests | FileCheck %s
+
+
+; CHECK: define internal i8* @f2.cfi() !type !0 {
+; CHECK-NEXT: br label %b
+; CHECK: b:
+; CHECK-NEXT: ret i8* blockaddress(@f2.cfi, %b)
+; CHECK-NEXT: }
+
+target triple = "x86_64-unknown-linux"
+
+define void @f1() {
+entry:
+ %0 = call i1 @llvm.type.test(i8* bitcast (i8* ()* @f2 to i8*), metadata !"_ZTSFvP3bioE")
+ ret void
+}
+
+declare i1 @llvm.type.test(i8*, metadata)
+
+define i8* @f2() !type !5 {
+ br label %b
+
+b:
+ ret i8* blockaddress(@f2, %b)
+}
+
+!5 = !{i64 0, !"_ZTSFvP3bioE"}
diff --git a/test/Transforms/LowerTypeTests/import-unsat.ll b/test/Transforms/LowerTypeTests/import-unsat.ll
index 6cb9b26fb57..b9eb552dd66 100644
--- a/test/Transforms/LowerTypeTests/import-unsat.ll
+++ b/test/Transforms/LowerTypeTests/import-unsat.ll
@@ -7,6 +7,7 @@
; SUMMARY-NEXT: - Linkage: 0
; SUMMARY-NEXT: NotEligibleToImport: false
; SUMMARY-NEXT: Live: true
+; SUMMARY-NEXT: Local: false
; SUMMARY-NEXT: TypeTests: [ 123 ]
; SUMMARY-NEXT: TypeIdMap:
; SUMMARY-NEXT: typeid1:
diff --git a/test/Transforms/PGOProfile/Inputs/irreducible.proftext b/test/Transforms/PGOProfile/Inputs/irreducible.proftext
new file mode 100644
index 00000000000..9b0210d9a30
--- /dev/null
+++ b/test/Transforms/PGOProfile/Inputs/irreducible.proftext
@@ -0,0 +1,29 @@
+:ir
+_Z11irreducibleii
+# Func Hash:
+64451410787
+# Num Counters:
+6
+# Counter Values:
+1000
+950
+100
+373
+1
+0
+
+_Z11irreduciblePh
+# Func Hash:
+104649601521
+# Num Counters:
+9
+# Counter Values:
+100
+300
+99
+300
+201
+1
+1
+0
+0
diff --git a/test/Transforms/PGOProfile/irreducible.ll b/test/Transforms/PGOProfile/irreducible.ll
new file mode 100644
index 00000000000..37f6e206ee9
--- /dev/null
+++ b/test/Transforms/PGOProfile/irreducible.ll
@@ -0,0 +1,184 @@
+; RUN: llvm-profdata merge %S/Inputs/irreducible.proftext -o %t.profdata
+; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE
+
+; GEN: $__llvm_profile_raw_version = comdat any
+
+; Function Attrs: noinline norecurse nounwind readnone uwtable
+define i32 @_Z11irreducibleii(i32 %iter_outer, i32 %iter_inner) local_unnamed_addr #0 {
+entry:
+ %cmp24 = icmp sgt i32 %iter_outer, 0
+ br i1 %cmp24, label %for.body, label %entry.for.cond.cleanup_crit_edge
+
+entry.for.cond.cleanup_crit_edge: ; preds = %entry
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %entry.for.cond.cleanup_crit_edge, %for.end
+ %sum.0.lcssa = phi i32 [ 0, %entry.for.cond.cleanup_crit_edge ], [ %sum.1, %for.end ]
+ ret i32 %sum.0.lcssa
+
+for.body: ; preds = %entry, %for.end
+ %k.026 = phi i32 [ %inc12, %for.end ], [ 0, %entry ]
+ %sum.025 = phi i32 [ %sum.1, %for.end ], [ 0, %entry ]
+ %rem23 = and i32 %k.026, 1
+ %cmp1 = icmp eq i32 %rem23, 0
+ br i1 %cmp1, label %entry8, label %for.cond2
+
+for.cond2: ; preds = %for.body, %if.end9
+ %sum.1 = phi i32 [ %add10, %if.end9 ], [ %sum.025, %for.body ]
+ %i.0 = phi i32 [ %inc, %if.end9 ], [ 0, %for.body ]
+ %cmp3 = icmp slt i32 %i.0, %iter_inner
+ br i1 %cmp3, label %for.body4, label %for.end
+; USE: br i1 %cmp3, label %for.body4, label %for.end, !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[FOR_COND2_IRR_LOOP:[0-9]+]]
+
+for.body4: ; preds = %for.cond2
+ %rem5 = srem i32 %k.026, 3
+ %cmp6 = icmp eq i32 %rem5, 0
+ br i1 %cmp6, label %entry8, label %if.end9
+
+entry8: ; preds = %for.body4, %for.body
+ %sum.2 = phi i32 [ %sum.025, %for.body ], [ %sum.1, %for.body4 ]
+ %i.1 = phi i32 [ 0, %for.body ], [ %i.0, %for.body4 ]
+ %add = add nsw i32 %sum.2, 4
+ br label %if.end9
+; USE: br label %if.end9,
+; USE-SAME: !irr_loop ![[ENTRY8_IRR_LOOP:[0-9]+]]
+
+if.end9: ; preds = %entry8, %for.body4
+ %sum.3 = phi i32 [ %add, %entry8 ], [ %sum.1, %for.body4 ]
+ %i.2 = phi i32 [ %i.1, %entry8 ], [ %i.0, %for.body4 ]
+ %add10 = add nsw i32 %sum.3, 1
+ %inc = add nsw i32 %i.2, 1
+ br label %for.cond2
+; USE: br label %for.cond2,
+; USE-SAME: !irr_loop ![[IF_END9_IRR_LOOP:[0-9]+]]
+
+for.end: ; preds = %for.cond2
+ %inc12 = add nuw nsw i32 %k.026, 1
+ %exitcond = icmp eq i32 %inc12, %iter_outer
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+
+
+@targets = local_unnamed_addr global [256 x i8*] zeroinitializer, align 16
+@tracing = local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: noinline norecurse nounwind uwtable
+define i32 @_Z11irreduciblePh(i8* nocapture readonly %p) {
+entry:
+ store <2 x i8*> <i8* blockaddress(@_Z11irreduciblePh, %sw.bb), i8* blockaddress(@_Z11irreduciblePh, %TARGET_1)>, <2 x i8*>* bitcast ([256 x i8*]* @targets to <2 x i8*>*), align 16
+ store i8* blockaddress(@_Z11irreduciblePh, %TARGET_2), i8** getelementptr inbounds ([256 x i8*], [256 x i8*]* @targets, i64 0, i64 2), align 16
+ %0 = load i32, i32* @tracing, align 4
+ %tobool = icmp eq i32 %0, 0
+ br label %for.cond1
+
+for.cond1: ; preds = %sw.default, %entry
+ %p.addr.0 = phi i8* [ %p, %entry ], [ %p.addr.4, %sw.default ]
+ %sum.0 = phi i32 [ 0, %entry ], [ %add25, %sw.default ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i64 1
+ %1 = load i8, i8* %p.addr.0, align 1
+ %incdec.ptr2 = getelementptr inbounds i8, i8* %p.addr.0, i64 2
+ %2 = load i8, i8* %incdec.ptr, align 1
+ %conv3 = zext i8 %2 to i32
+ br label %dispatch_op
+
+dispatch_op: ; preds = %sw.bb6, %for.cond1
+ %p.addr.1 = phi i8* [ %incdec.ptr2, %for.cond1 ], [ %p.addr.2, %sw.bb6 ]
+ %op.0 = phi i8 [ %1, %for.cond1 ], [ 1, %sw.bb6 ]
+ %oparg.0 = phi i32 [ %conv3, %for.cond1 ], [ %oparg.2, %sw.bb6 ]
+ %sum.1 = phi i32 [ %sum.0, %for.cond1 ], [ %add7, %sw.bb6 ]
+ switch i8 %op.0, label %sw.default [
+ i8 0, label %sw.bb
+ i8 1, label %dispatch_op.sw.bb6_crit_edge
+ i8 2, label %sw.bb15
+ ]
+
+dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op
+ br label %sw.bb6
+
+sw.bb: ; preds = %indirectgoto, %dispatch_op
+ %oparg.1 = phi i32 [ %oparg.0, %dispatch_op ], [ 0, %indirectgoto ]
+ %sum.2 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %indirectgoto ]
+ %add.neg = sub i32 -5, %oparg.1
+ %sub = add i32 %add.neg, %sum.2
+ br label %exit
+
+TARGET_1: ; preds = %indirectgoto
+ %incdec.ptr4 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
+ %3 = load i8, i8* %p.addr.5, align 1
+ %conv5 = zext i8 %3 to i32
+ br label %sw.bb6
+
+sw.bb6: ; preds = %dispatch_op.sw.bb6_crit_edge, %TARGET_1
+ %p.addr.2 = phi i8* [ %incdec.ptr4, %TARGET_1 ], [ %p.addr.1, %dispatch_op.sw.bb6_crit_edge ]
+ %oparg.2 = phi i32 [ %conv5, %TARGET_1 ], [ %oparg.0, %dispatch_op.sw.bb6_crit_edge ]
+ %sum.3 = phi i32 [ %sum.7, %TARGET_1 ], [ %sum.1, %dispatch_op.sw.bb6_crit_edge ]
+ %mul = mul nsw i32 %oparg.2, 7
+ %add7 = add nsw i32 %sum.3, %mul
+ %rem46 = and i32 %add7, 1
+ %cmp8 = icmp eq i32 %rem46, 0
+ br i1 %cmp8, label %dispatch_op, label %if.then
+; USE: br i1 %cmp8, label %dispatch_op, label %if.then, !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[SW_BB6_IRR_LOOP:[0-9]+]]
+
+if.then: ; preds = %sw.bb6
+ %mul9 = mul nsw i32 %add7, 9
+ br label %indirectgoto
+
+TARGET_2: ; preds = %indirectgoto
+ %incdec.ptr13 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
+ %4 = load i8, i8* %p.addr.5, align 1
+ %conv14 = zext i8 %4 to i32
+ br label %sw.bb15
+
+sw.bb15: ; preds = %TARGET_2, %dispatch_op
+ %p.addr.3 = phi i8* [ %p.addr.1, %dispatch_op ], [ %incdec.ptr13, %TARGET_2 ]
+ %oparg.3 = phi i32 [ %oparg.0, %dispatch_op ], [ %conv14, %TARGET_2 ]
+ %sum.4 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %TARGET_2 ]
+ %add16 = add nsw i32 %oparg.3, 3
+ %add17 = add nsw i32 %add16, %sum.4
+ br i1 %tobool, label %if.then18, label %exit
+; USE: br i1 %tobool, label %if.then18, label %exit, !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[SW_BB15_IRR_LOOP:[0-9]+]]
+
+if.then18: ; preds = %sw.bb15
+ %idx.ext = sext i32 %oparg.3 to i64
+ %add.ptr = getelementptr inbounds i8, i8* %p.addr.3, i64 %idx.ext
+ %mul19 = mul nsw i32 %add17, 17
+ br label %indirectgoto
+
+unknown_op: ; preds = %indirectgoto
+ %sub24 = add nsw i32 %sum.7, -4
+ br label %sw.default
+
+sw.default: ; preds = %unknown_op, %dispatch_op
+ %p.addr.4 = phi i8* [ %p.addr.5, %unknown_op ], [ %p.addr.1, %dispatch_op ]
+ %sum.5 = phi i32 [ %sub24, %unknown_op ], [ %sum.1, %dispatch_op ]
+ %add25 = add nsw i32 %sum.5, 11
+ br label %for.cond1
+
+exit: ; preds = %sw.bb15, %sw.bb
+ %sum.6 = phi i32 [ %sub, %sw.bb ], [ %add17, %sw.bb15 ]
+ ret i32 %sum.6
+
+indirectgoto: ; preds = %if.then18, %if.then
+ %add.ptr.pn = phi i8* [ %add.ptr, %if.then18 ], [ %p.addr.2, %if.then ]
+ %sum.7 = phi i32 [ %mul19, %if.then18 ], [ %mul9, %if.then ]
+ %p.addr.5 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 1
+ %5 = load i8, i8* %add.ptr.pn, align 1
+ %idxprom21 = zext i8 %5 to i64
+ %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21
+ %6 = load i8*, i8** %arrayidx22, align 8
+ indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2]
+; USE: indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[INDIRECTGOTO_IRR_LOOP:[0-9]+]]
+}
+
+; USE: ![[FOR_COND2_IRR_LOOP]] = !{!"loop_header_weight", i64 1050}
+; USE: ![[ENTRY8_IRR_LOOP]] = !{!"loop_header_weight", i64 373}
+; USE: ![[IF_END9_IRR_LOOP]] = !{!"loop_header_weight", i64 1000}
+; USE: ![[SW_BB6_IRR_LOOP]] = !{!"loop_header_weight", i64 501}
+; USE: ![[SW_BB15_IRR_LOOP]] = !{!"loop_header_weight", i64 100}
+; USE: ![[INDIRECTGOTO_IRR_LOOP]] = !{!"loop_header_weight", i64 400}
diff --git a/test/Transforms/PGOProfile/thinlto_samplepgo_icp2.ll b/test/Transforms/PGOProfile/thinlto_samplepgo_icp2.ll
index c1c074e75a7..1751854d448 100644
--- a/test/Transforms/PGOProfile/thinlto_samplepgo_icp2.ll
+++ b/test/Transforms/PGOProfile/thinlto_samplepgo_icp2.ll
@@ -22,7 +22,7 @@
; RUN: llvm-nm %t3.2 | FileCheck %s --check-prefix=NM
; NM: _ZL3barv
; RUN: llvm-dis < %t3.2.2.internalize.bc | FileCheck %s --check-prefix=INTERNALIZE
-; INTERNALIZE: define void @_ZL3barv
+; INTERNALIZE: define dso_local void @_ZL3barv
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll
index 105afa9def5..ebc15865a67 100644
--- a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll
+++ b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll
@@ -75,6 +75,54 @@ define void @test_dereferenceable(i32 addrspace(1)* addrspace(1)* %p, i32 %x, i3
ret void
}
+; invariant.start allows us to sink the load past the baz statepoint call into taken block, which is
+; incorrect. remove the invariant.start and RAUW undef.
+define void @test_inv_start(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" {
+; CHECK-LABEL: test_inv_start
+; CHECK-NOT: invariant.start
+; CHECK: gc.statepoint
+ %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p
+ %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1)
+ %v2 = load i32, i32 addrspace(1)* %v1
+ call void @baz(i32 %x)
+ br i1 %cond, label %taken, label %untaken
+
+taken:
+ store i32 %v2, i32 addrspace(1)* %q, align 16
+ call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1)
+ ret void
+
+; CHECK-LABEL: untaken:
+; CHECK: gc.statepoint
+untaken:
+ %foo = call i32 @escaping.invariant.start({}* %invst)
+ call void @dummy(i32 %foo)
+ ret void
+}
+
+; invariant.start is removed and the uses are undef'ed.
+define void @test_inv_start2(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" {
+; CHECK-LABEL: test_inv_start2
+; CHECK-NOT: invariant.start
+; CHECK: gc.statepoint
+ %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p
+ %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1)
+ %v2 = load i32, i32 addrspace(1)* %v1
+ call void @baz(i32 %x)
+ br i1 %cond, label %taken, label %untaken
+
+taken:
+ store i32 %v2, i32 addrspace(1)* %q, align 16
+ call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1)
+ ret void
+
+untaken:
+ ret void
+}
+declare {}* @llvm.invariant.start.p1i32(i64, i32 addrspace(1)* nocapture) nounwind readonly
+declare void @llvm.invariant.end.p1i32({}*, i64, i32 addrspace(1)* nocapture) nounwind
+declare i32 @escaping.invariant.start({}*) nounwind
+declare void @dummy(i32)
declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64, i32, void (i32)*, i32, i32, ...)
; Function Attrs: nounwind readonly
diff --git a/test/Transforms/SLPVectorizer/X86/call.ll b/test/Transforms/SLPVectorizer/X86/call.ll
index 03b1e837a0c..8397d348483 100644
--- a/test/Transforms/SLPVectorizer/X86/call.ll
+++ b/test/Transforms/SLPVectorizer/X86/call.ll
@@ -11,133 +11,158 @@ declare double @sqrt(double)
declare i64 @round(i64)
-; CHECK: sin_libm
-; CHECK: call <2 x double> @llvm.sin.v2f64
-; CHECK: ret void
-define void @sin_libm(double* %a, double* %b, double* %c) {
-entry:
- %i0 = load double, double* %a, align 8
- %i1 = load double, double* %b, align 8
- %mul = fmul double %i0, %i1
- %call = tail call double @sin(double %mul) nounwind readnone
- %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
- %i3 = load double, double* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
- %i4 = load double, double* %arrayidx4, align 8
- %mul5 = fmul double %i3, %i4
- %call5 = tail call double @sin(double %mul5) nounwind readnone
- store double %call, double* %c, align 8
- %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
- store double %call5, double* %arrayidx5, align 8
+define void @sin_libm(double* %a, double* %b) {
+; CHECK-LABEL: @sin_libm(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* %a to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* %b to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %sin1 = tail call double @sin(double %a0) nounwind readnone
+ %sin2 = tail call double @sin(double %a1) nounwind readnone
+ store double %sin1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %sin2, double* %idx2, align 8
ret void
}
-; CHECK: cos_libm
-; CHECK: call <2 x double> @llvm.cos.v2f64
-; CHECK: ret void
-define void @cos_libm(double* %a, double* %b, double* %c) {
-entry:
- %i0 = load double, double* %a, align 8
- %i1 = load double, double* %b, align 8
- %mul = fmul double %i0, %i1
- %call = tail call double @cos(double %mul) nounwind readnone
- %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
- %i3 = load double, double* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
- %i4 = load double, double* %arrayidx4, align 8
- %mul5 = fmul double %i3, %i4
- %call5 = tail call double @cos(double %mul5) nounwind readnone
- store double %call, double* %c, align 8
- %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
- store double %call5, double* %arrayidx5, align 8
+define void @cos_libm(double* %a, double* %b) {
+; CHECK-LABEL: @cos_libm(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* %a to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.cos.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* %b to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %cos1 = tail call double @cos(double %a0) nounwind readnone
+ %cos2 = tail call double @cos(double %a1) nounwind readnone
+ store double %cos1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %cos2, double* %idx2, align 8
ret void
}
-; CHECK: pow_libm
-; CHECK: call <2 x double> @llvm.pow.v2f64
-; CHECK: ret void
-define void @pow_libm(double* %a, double* %b, double* %c) {
-entry:
- %i0 = load double, double* %a, align 8
- %i1 = load double, double* %b, align 8
- %mul = fmul double %i0, %i1
- %call = tail call double @pow(double %mul,double %mul) nounwind readnone
- %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
- %i3 = load double, double* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
- %i4 = load double, double* %arrayidx4, align 8
- %mul5 = fmul double %i3, %i4
- %call5 = tail call double @pow(double %mul5,double %mul5) nounwind readnone
- store double %call, double* %c, align 8
- %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
- store double %call5, double* %arrayidx5, align 8
+define void @pow_libm(double* %a, double* %b) {
+; CHECK-LABEL: @pow_libm(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* %a to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.pow.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* %b to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %pow1 = tail call double @pow(double %a0, double %a0) nounwind readnone
+ %pow2 = tail call double @pow(double %a1, double %a1) nounwind readnone
+ store double %pow1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %pow2, double* %idx2, align 8
ret void
}
-
-; CHECK: exp2_libm
-; CHECK: call <2 x double> @llvm.exp2.v2f64
-; CHECK: ret void
-define void @exp2_libm(double* %a, double* %b, double* %c) {
-entry:
- %i0 = load double, double* %a, align 8
- %i1 = load double, double* %b, align 8
- %mul = fmul double %i0, %i1
- %call = tail call double @exp2(double %mul) nounwind readnone
- %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
- %i3 = load double, double* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
- %i4 = load double, double* %arrayidx4, align 8
- %mul5 = fmul double %i3, %i4
- %call5 = tail call double @exp2(double %mul5) nounwind readnone
- store double %call, double* %c, align 8
- %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
- store double %call5, double* %arrayidx5, align 8
+define void @exp_libm(double* %a, double* %b) {
+; CHECK-LABEL: @exp_libm(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* %a to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.exp2.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* %b to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %exp1 = tail call double @exp2(double %a0) nounwind readnone
+ %exp2 = tail call double @exp2(double %a1) nounwind readnone
+ store double %exp1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %exp2, double* %idx2, align 8
ret void
}
-
-; CHECK: sqrt_libm
-; CHECK: call nnan <2 x double> @llvm.sqrt.v2f64
-; CHECK: ret void
-define void @sqrt_libm(double* %a, double* %b, double* %c) {
-entry:
- %i0 = load double, double* %a, align 8
- %i1 = load double, double* %b, align 8
- %mul = fmul double %i0, %i1
- %call = tail call nnan double @sqrt(double %mul) nounwind readnone
- %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
- %i3 = load double, double* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
- %i4 = load double, double* %arrayidx4, align 8
- %mul5 = fmul double %i3, %i4
- %call5 = tail call nnan double @sqrt(double %mul5) nounwind readnone
- store double %call, double* %c, align 8
- %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
- store double %call5, double* %arrayidx5, align 8
+; No fast-math-flags are required to convert sqrt library calls to an intrinsic.
+; We just need to know that errno is not set (readnone).
+
+define void @sqrt_libm_no_errno(double* %a, double* %b) {
+; CHECK-LABEL: @sqrt_libm_no_errno(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* %a to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* %b to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %sqrt1 = tail call double @sqrt(double %a0) nounwind readnone
+ %sqrt2 = tail call double @sqrt(double %a1) nounwind readnone
+ store double %sqrt1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %sqrt2, double* %idx2, align 8
ret void
}
+; The sqrt intrinsic does not set errno, but a non-constant sqrt call might, so this can't vectorize.
+; The nnan on the call does not matter because there's no guarantee in the C standard that a negative
+; input would result in a nan output ("On a domain error, the function returns an
+; implementation-defined value.")
+
+define void @sqrt_libm_errno(double* %a, double* %b) {
+; CHECK-LABEL: @sqrt_libm_errno(
+; CHECK-NEXT: [[A0:%.*]] = load double, double* %a, align 8
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* %a, i64 1
+; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDX1]], align 8
+; CHECK-NEXT: [[SQRT1:%.*]] = tail call nnan double @sqrt(double [[A0]]) #2
+; CHECK-NEXT: [[SQRT2:%.*]] = tail call nnan double @sqrt(double [[A1]]) #2
+; CHECK-NEXT: store double [[SQRT1]], double* %b, align 8
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, double* %b, i64 1
+; CHECK-NEXT: store double [[SQRT2]], double* [[IDX2]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %sqrt1 = tail call nnan double @sqrt(double %a0) nounwind
+ %sqrt2 = tail call nnan double @sqrt(double %a1) nounwind
+ store double %sqrt1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %sqrt2, double* %idx2, align 8
+ ret void
+}
; Negative test case
-; CHECK: round_custom
-; CHECK-NOT: load <4 x i64>
-; CHECK: ret void
-define void @round_custom(i64* %a, i64* %b, i64* %c) {
-entry:
- %i0 = load i64, i64* %a, align 8
- %i1 = load i64, i64* %b, align 8
- %mul = mul i64 %i0, %i1
- %call = tail call i64 @round(i64 %mul) nounwind readnone
- %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 1
- %i3 = load i64, i64* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds i64, i64* %b, i64 1
- %i4 = load i64, i64* %arrayidx4, align 8
- %mul5 = mul i64 %i3, %i4
- %call5 = tail call i64 @round(i64 %mul5) nounwind readnone
- store i64 %call, i64* %c, align 8
- %arrayidx5 = getelementptr inbounds i64, i64* %c, i64 1
- store i64 %call5, i64* %arrayidx5, align 8
+define void @round_custom(i64* %a, i64* %b) {
+; CHECK-LABEL: @round_custom(
+; CHECK-NEXT: [[A0:%.*]] = load i64, i64* %a, align 8
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds i64, i64* %a, i64 1
+; CHECK-NEXT: [[A1:%.*]] = load i64, i64* [[IDX1]], align 8
+; CHECK-NEXT: [[ROUND1:%.*]] = tail call i64 @round(i64 [[A0]]) #3
+; CHECK-NEXT: [[ROUND2:%.*]] = tail call i64 @round(i64 [[A1]]) #3
+; CHECK-NEXT: store i64 [[ROUND1]], i64* %b, align 8
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds i64, i64* %b, i64 1
+; CHECK-NEXT: store i64 [[ROUND2]], i64* [[IDX2]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load i64, i64* %a, align 8
+ %idx1 = getelementptr inbounds i64, i64* %a, i64 1
+ %a1 = load i64, i64* %idx1, align 8
+ %round1 = tail call i64 @round(i64 %a0) nounwind readnone
+ %round2 = tail call i64 @round(i64 %a1) nounwind readnone
+ store i64 %round1, i64* %b, align 8
+ %idx2 = getelementptr inbounds i64, i64* %b, i64 1
+ store i64 %round2, i64* %idx2, align 8
ret void
}
diff --git a/test/Transforms/SLPVectorizer/X86/cast.ll b/test/Transforms/SLPVectorizer/X86/cast.ll
index 5d7118753e9..2f9f84948ea 100644
--- a/test/Transforms/SLPVectorizer/X86/cast.ll
+++ b/test/Transforms/SLPVectorizer/X86/cast.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -basicaa -slp-vectorizer -dce -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -basicaa -slp-vectorizer -dce -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -basicaa -slp-vectorizer -dce -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -basicaa -slp-vectorizer -dce -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -14,10 +14,10 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
define i32 @test_sext_4i8_to_4i32(i32* noalias nocapture %A, i8* noalias nocapture %B) {
; CHECK-LABEL: @test_sext_4i8_to_4i32(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* %B to <4 x i8>*
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[B:%.*]] to <4 x i8>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i8> [[TMP1]] to <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* %A to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: ret i32 undef
;
@@ -46,10 +46,10 @@ entry:
define i32 @test_zext_4i16_to_4i32(i32* noalias nocapture %A, i16* noalias nocapture %B) {
; CHECK-LABEL: @test_zext_4i16_to_4i32(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* %B to <4 x i16>*
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <4 x i16>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* %A to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: ret i32 undef
;
@@ -76,30 +76,21 @@ entry:
}
define i64 @test_sext_4i16_to_4i64(i64* noalias nocapture %A, i16* noalias nocapture %B) {
-; SSE-LABEL: @test_sext_4i16_to_4i64(
-; SSE-NEXT: entry:
-; SSE-NEXT: [[TMP0:%.*]] = bitcast i16* %B to <2 x i16>*
-; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* [[TMP0]], align 1
-; SSE-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i64>
-; SSE-NEXT: [[TMP3:%.*]] = bitcast i64* %A to <2 x i64>*
-; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 4
-; SSE-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* %B, i64 2
-; SSE-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* %A, i64 2
-; SSE-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX5]] to <2 x i16>*
-; SSE-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* [[TMP4]], align 1
-; SSE-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP5]] to <2 x i64>
-; SSE-NEXT: [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX7]] to <2 x i64>*
-; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4
-; SSE-NEXT: ret i64 undef
-;
-; AVX-LABEL: @test_sext_4i16_to_4i64(
-; AVX-NEXT: entry:
-; AVX-NEXT: [[TMP0:%.*]] = bitcast i16* %B to <4 x i16>*
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 1
-; AVX-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i64>
-; AVX-NEXT: [[TMP3:%.*]] = bitcast i64* %A to <4 x i64>*
-; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* [[TMP3]], align 4
-; AVX-NEXT: ret i64 undef
+; CHECK-LABEL: @test_sext_4i16_to_4i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <2 x i16>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* [[TMP0]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX5]] to <2 x i16>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* [[TMP4]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP5]] to <2 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX7]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4
+; CHECK-NEXT: ret i64 undef
;
entry:
%0 = load i16, i16* %B, align 1
diff --git a/test/Transforms/SLPVectorizer/X86/load-merge.ll b/test/Transforms/SLPVectorizer/X86/load-merge.ll
new file mode 100644
index 00000000000..df990be073b
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s
+
+;unsigned load_le32(unsigned char *data) {
+; unsigned le32 = (data[0]<<0) | (data[1]<<8) | (data[2]<<16) | (data[3]<<24);
+; return le32;
+;}
+
+define i32 @_Z9load_le32Ph(i8* nocapture readonly %data) {
+; CHECK-LABEL: @_Z9load_le32Ph(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[DATA:%.*]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT: [[SHL3:%.*]] = shl nuw nsw i32 [[CONV2]], 8
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHL3]], [[CONV]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX4]], align 1
+; CHECK-NEXT: [[CONV5:%.*]] = zext i8 [[TMP2]] to i32
+; CHECK-NEXT: [[SHL6:%.*]] = shl nuw nsw i32 [[CONV5]], 16
+; CHECK-NEXT: [[OR7:%.*]] = or i32 [[OR]], [[SHL6]]
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX8]], align 1
+; CHECK-NEXT: [[CONV9:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT: [[SHL10:%.*]] = shl nuw i32 [[CONV9]], 24
+; CHECK-NEXT: [[OR11:%.*]] = or i32 [[OR7]], [[SHL10]]
+; CHECK-NEXT: ret i32 [[OR11]]
+;
+entry:
+ %0 = load i8, i8* %data, align 1
+ %conv = zext i8 %0 to i32
+ %arrayidx1 = getelementptr inbounds i8, i8* %data, i64 1
+ %1 = load i8, i8* %arrayidx1, align 1
+ %conv2 = zext i8 %1 to i32
+ %shl3 = shl nuw nsw i32 %conv2, 8
+ %or = or i32 %shl3, %conv
+ %arrayidx4 = getelementptr inbounds i8, i8* %data, i64 2
+ %2 = load i8, i8* %arrayidx4, align 1
+ %conv5 = zext i8 %2 to i32
+ %shl6 = shl nuw nsw i32 %conv5, 16
+ %or7 = or i32 %or, %shl6
+ %arrayidx8 = getelementptr inbounds i8, i8* %data, i64 3
+ %3 = load i8, i8* %arrayidx8, align 1
+ %conv9 = zext i8 %3 to i32
+ %shl10 = shl nuw i32 %conv9, 24
+ %or11 = or i32 %or7, %shl10
+ ret i32 %or11
+}
diff --git a/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll b/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
new file mode 100644
index 00000000000..79fb782db8f
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
+
+;void Distance(float *p1, int p2, unsigned long p3[], float p4[]) {
+; long a = p3[0] = 5;
+; p1 += p2;
+; p4[3] += p1[a];
+; p3[0] >>= 5;
+; p3[1] >>= 5;
+; p3[2] >>= 5;
+; p3[3] >>= 5;
+; p1 += p2;
+; p4[0] += p1[p3[0] & a];
+;}
+
+define void @_Z8DistanceIlLi5EEvPfiPmS0_(float* %p1, i32 %p2, i64* %p3, float* %p4) {
+; CHECK-LABEL: @_Z8DistanceIlLi5EEvPfiPmS0_(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: store i64 5, i64* [[P3:%.*]], align 8
+; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[P2:%.*]] to i64
+; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[P1:%.*]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 5
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[P4:%.*]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
+; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[P3]], align 8
+; CHECK-NEXT: [[SHR:%.*]] = lshr i64 [[TMP2]], 5
+; CHECK-NEXT: store i64 [[SHR]], i64* [[P3]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ARRAYIDX4]] to <2 x i64>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = lshr <2 x i64> [[TMP4]], <i64 5, i64 5>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX4]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
+; CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[ARRAYIDX8]], align 8
+; CHECK-NEXT: [[SHR9:%.*]] = lshr i64 [[TMP7]], 5
+; CHECK-NEXT: store i64 [[SHR9]], i64* [[ARRAYIDX8]], align 8
+; CHECK-NEXT: [[ADD_PTR11:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[AND:%.*]] = and i64 [[SHR]], 5
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[ADD_PTR11]], i64 [[AND]]
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX13]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[P4]], align 4
+; CHECK-NEXT: [[ADD15:%.*]] = fadd float [[TMP8]], [[TMP9]]
+; CHECK-NEXT: store float [[ADD15]], float* [[P4]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ store i64 5, i64* %p3, align 8
+ %idx.ext = sext i32 %p2 to i64
+ %add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext
+ %arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5
+ %0 = load float, float* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %p4, i64 3
+ %1 = load float, float* %arrayidx2, align 4
+ %add = fadd float %0, %1
+ store float %add, float* %arrayidx2, align 4
+ %2 = load i64, i64* %p3, align 8
+ %shr = lshr i64 %2, 5
+ store i64 %shr, i64* %p3, align 8
+ %arrayidx4 = getelementptr inbounds i64, i64* %p3, i64 1
+ %3 = load i64, i64* %arrayidx4, align 8
+ %shr5 = lshr i64 %3, 5
+ store i64 %shr5, i64* %arrayidx4, align 8
+ %arrayidx6 = getelementptr inbounds i64, i64* %p3, i64 2
+ %4 = load i64, i64* %arrayidx6, align 8
+ %shr7 = lshr i64 %4, 5
+ store i64 %shr7, i64* %arrayidx6, align 8
+ %arrayidx8 = getelementptr inbounds i64, i64* %p3, i64 3
+ %5 = load i64, i64* %arrayidx8, align 8
+ %shr9 = lshr i64 %5, 5
+ store i64 %shr9, i64* %arrayidx8, align 8
+ %add.ptr11 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext
+ %and = and i64 %shr, 5
+ %arrayidx13 = getelementptr inbounds float, float* %add.ptr11, i64 %and
+ %6 = load float, float* %arrayidx13, align 4
+ %7 = load float, float* %p4, align 4
+ %add15 = fadd float %6, %7
+ store float %add15, float* %p4, align 4
+ ret void
+}
diff --git a/test/Transforms/SampleProfile/indirect-call.ll b/test/Transforms/SampleProfile/indirect-call.ll
index 61a1bc51996..0c00639e6c0 100644
--- a/test/Transforms/SampleProfile/indirect-call.ll
+++ b/test/Transforms/SampleProfile/indirect-call.ll
@@ -182,7 +182,7 @@ define void @test_direct() !dbg !22 {
; CHECK: ![[PROF]] = !{!"VP", i32 0, i64 3457, i64 9191153033785521275, i64 2059, i64 -1069303473483922844, i64 1398}
; CHECK: ![[BR1]] = !{!"branch_weights", i32 4000, i32 4000}
; CHECK: ![[BR2]] = !{!"branch_weights", i32 3000, i32 1000}
-; CHECK: ![[VP]] = !{!"VP", i32 0, i64 1000, i64 -6391416044382067764, i64 1000}
+; CHECK: ![[VP]] = !{!"VP", i32 0, i64 8000, i64 -6391416044382067764, i64 1000}
!6 = distinct !DISubprogram(name: "test_inline", scope: !1, file: !1, line: 6, unit: !0)
!7 = !DILocation(line: 7, scope: !6)
!8 = distinct !DISubprogram(name: "test_inline_strip", scope: !1, file: !1, line: 8, unit: !0)
diff --git a/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll b/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll
index a2b94038001..a2ca63d0a2d 100644
--- a/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll
+++ b/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S < %s -simplifycfg -simplifycfg-merge-cond-stores=true -simplifycfg-merge-cond-stores-aggressively=false -phi-node-folding-threshold=2 | FileCheck %s
+; RUN: opt -S < %s -simplifycfg -simplifycfg-merge-cond-stores=true -simplifycfg-merge-cond-stores-aggressively=false -phi-node-folding-threshold=1 | FileCheck %s
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "armv7--linux-gnueabihf"
diff --git a/test/Transforms/WholeProgramDevirt/import-indir.ll b/test/Transforms/WholeProgramDevirt/import-indir.ll
index 052a3494834..927ee16b370 100644
--- a/test/Transforms/WholeProgramDevirt/import-indir.ll
+++ b/test/Transforms/WholeProgramDevirt/import-indir.ll
@@ -7,6 +7,7 @@
; SUMMARY-NEXT: - Linkage: 0
; SUMMARY-NEXT: NotEligibleToImport: false
; SUMMARY-NEXT: Live: true
+; SUMMARY-NEXT: Local: false
; SUMMARY-NEXT: TypeTestAssumeVCalls:
; SUMMARY-NEXT: - GUID: 123
; SUMMARY-NEXT: Offset: 0
diff --git a/test/lit.cfg.py b/test/lit.cfg.py
index 6a5cf69b987..73a3b4b58a8 100644
--- a/test/lit.cfg.py
+++ b/test/lit.cfg.py
@@ -168,6 +168,10 @@ for arch in config.targets_to_build.split():
config.available_features.add(arch.lower() + '-registered-target')
# Features
+known_arches = ["x86_64", "mips64", "ppc64", "aarch64"]
+if (config.host_ldflags.find("-m32") < 0
+ and any(config.llvm_host_triple.startswith(x) for x in known_arches)):
+ config.available_features.add("llvm-64-bits")
# Others/can-execute.txt
if sys.platform not in ['win32']:
diff --git a/test/lit.site.cfg.py.in b/test/lit.site.cfg.py.in
index 19e5cd0d3c2..dff46dcff32 100644
--- a/test/lit.site.cfg.py.in
+++ b/test/lit.site.cfg.py.in
@@ -29,7 +29,6 @@ config.targets_to_build = "@TARGETS_TO_BUILD@"
config.native_target = "@LLVM_NATIVE_ARCH@"
config.llvm_bindings = "@LLVM_BINDINGS@".split(' ')
config.host_os = "@HOST_OS@"
-config.host_arch = "@HOST_ARCH@"
config.host_cc = "@HOST_CC@"
config.host_cxx = "@HOST_CXX@"
config.host_ldflags = "@HOST_LDFLAGS@"
@@ -42,6 +41,8 @@ config.enable_ffi = @LLVM_ENABLE_FFI@
config.build_shared_libs = @BUILD_SHARED_LIBS@
config.link_llvm_dylib = @LLVM_LINK_LLVM_DYLIB@
config.llvm_libxml2_enabled = "@LLVM_LIBXML2_ENABLED@"
+config.llvm_host_triple = '@LLVM_HOST_TRIPLE@'
+config.host_arch = "@HOST_ARCH@"
# Support substitution of the tools_dir with user parameters. This is
# used when we can't determine the tool dir at configuration time.
diff --git a/test/tools/dsymutil/cmdline.test b/test/tools/dsymutil/cmdline.test
index dea28cf3d90..f66858e9ae5 100644
--- a/test/tools/dsymutil/cmdline.test
+++ b/test/tools/dsymutil/cmdline.test
@@ -3,7 +3,7 @@ HELP: OVERVIEW: manipulate archived DWARF debug symbol files.
HELP: USAGE: llvm-dsymutil{{[^ ]*}} [options] <input files>
HELP-NOT: -reverse-iterate
HELP: Specific Options:
-HELP: -arch=<string>
+HELP: -arch=<arch>
HELP: -dump-debug-map
HELP: -flat
HELP: -no-odr
diff --git a/test/tools/gold/X86/asm_undefined2.ll b/test/tools/gold/X86/asm_undefined2.ll
index a170f45a55a..d6ed55a775a 100644
--- a/test/tools/gold/X86/asm_undefined2.ll
+++ b/test/tools/gold/X86/asm_undefined2.ll
@@ -9,10 +9,11 @@
; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold.so \
; RUN: --plugin-opt=save-temps \
; RUN: --plugin-opt=thinlto -o %t2 %t.o
-; RUN: llvm-dis < %t.o.5.precodegen.bc | FileCheck %s
+; RUN: llvm-dis < %t.o.5.precodegen.bc | FileCheck --check-prefix=CHECKTHIN %s
; Check that foo is not internalized
; CHECK: define void @foo
+; CHECKTHIN: define dso_local void @foo
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/tools/gold/X86/coff.ll b/test/tools/gold/X86/coff.ll
index 541383ddf51..e3eaa6a928c 100644
--- a/test/tools/gold/X86/coff.ll
+++ b/test/tools/gold/X86/coff.ll
@@ -11,7 +11,7 @@ define void @f() {
ret void
}
-; CHECK: define internal void @g() {
+; CHECK: define internal dso_local void @g() {
define hidden void @g() {
ret void
}
diff --git a/test/tools/gold/X86/common.ll b/test/tools/gold/X86/common.ll
index ca506f6dd2d..5d2c5157f69 100644
--- a/test/tools/gold/X86/common.ll
+++ b/test/tools/gold/X86/common.ll
@@ -46,4 +46,4 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; RUN: llvm-dis %t3.o -o - | FileCheck --check-prefix=MIXED %s
; Mixed ELF and IR. We keep ours as common so the linker will finish the merge.
-; MIXED: @a = common global i16 0, align 8
+; MIXED: @a = common dso_local global i16 0, align 8
diff --git a/test/tools/gold/X86/emit-llvm.ll b/test/tools/gold/X86/emit-llvm.ll
index 70d244c34ec..9aec93a78f0 100644
--- a/test/tools/gold/X86/emit-llvm.ll
+++ b/test/tools/gold/X86/emit-llvm.ll
@@ -48,14 +48,14 @@ target triple = "x86_64-unknown-linux-gnu"
@g8 = external global i32
-; CHECK-DAG: define internal void @f1()
+; CHECK-DAG: define internal dso_local void @f1()
; OPT2-NOT: @f1
define hidden void @f1() {
ret void
}
-; CHECK-DAG: define hidden void @f2()
-; OPT-DAG: define hidden void @f2()
+; CHECK-DAG: define dso_local hidden void @f2()
+; OPT-DAG: define dso_local hidden void @f2()
define hidden void @f2() {
ret void
}
diff --git a/test/tools/gold/X86/global_with_section.ll b/test/tools/gold/X86/global_with_section.ll
index 9023e76a4e6..c8291f8ceae 100644
--- a/test/tools/gold/X86/global_with_section.ll
+++ b/test/tools/gold/X86/global_with_section.ll
@@ -40,16 +40,16 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; We should not internalize @var_with_section due to section
-; CHECK-DAG: @var_with_section = global i32 0, section "some_section"
+; CHECK-DAG: @var_with_section = dso_local global i32 0, section "some_section"
@var_with_section = global i32 0, section "some_section"
; Confirm via a variable with a non-C identifier section that we are getting
; the expected internalization.
-; CHECK-DAG: @var_with_nonC_section = internal global i32 0, section ".nonCsection"
+; CHECK-DAG: @var_with_nonC_section = internal dso_local global i32 0, section ".nonCsection"
@var_with_nonC_section = global i32 0, section ".nonCsection"
; We should not internalize @deadfunc_with_section due to section
-; CHECK-DAG: define void @deadfunc_with_section() section "some_other_section"
+; CHECK-DAG: define dso_local void @deadfunc_with_section() section "some_other_section"
define void @deadfunc_with_section() section "some_other_section" {
call void @deadfunc2_called_from_section()
ret void
@@ -57,7 +57,7 @@ define void @deadfunc_with_section() section "some_other_section" {
; Confirm via a function with a non-C identifier section that we are getting
; the expected internalization.
-; CHECK-DAG: define internal void @deadfunc_with_nonC_section() section ".nonCsection"
+; CHECK-DAG: define internal dso_local void @deadfunc_with_nonC_section() section ".nonCsection"
define void @deadfunc_with_nonC_section() section ".nonCsection" {
call void @deadfunc2_called_from_nonC_section()
ret void
@@ -65,15 +65,15 @@ define void @deadfunc_with_nonC_section() section ".nonCsection" {
; In RegularLTO mode, where we have combined all the IR,
; @deadfunc2_called_from_section can be internalized.
-; CHECK2-REGULARLTO: define internal void @deadfunc2_called_from_section
+; CHECK2-REGULARLTO: define internal dso_local void @deadfunc2_called_from_section
; In ThinLTO mode, we can't internalize it as it needs to be preserved
; (due to the access from @deadfunc_with_section which must be preserved), and
; can't be internalized since the reference is from a different module.
-; CHECK2-THINLTO: define void @deadfunc2_called_from_section
+; CHECK2-THINLTO: define dso_local void @deadfunc2_called_from_section
declare void @deadfunc2_called_from_section()
; Confirm when called from a function with a non-C identifier section that we
; are getting the expected internalization.
-; CHECK2-REGULARLTO: define internal void @deadfunc2_called_from_nonC_section
-; CHECK2-THINLTO: define internal void @deadfunc2_called_from_nonC_section
+; CHECK2-REGULARLTO: define internal dso_local void @deadfunc2_called_from_nonC_section
+; CHECK2-THINLTO: define internal dso_local void @deadfunc2_called_from_nonC_section
declare void @deadfunc2_called_from_nonC_section()
diff --git a/test/tools/gold/X86/parallel.ll b/test/tools/gold/X86/parallel.ll
index 4de694c94c8..7d0e405d5d6 100644
--- a/test/tools/gold/X86/parallel.ll
+++ b/test/tools/gold/X86/parallel.ll
@@ -9,8 +9,8 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-; CHECK-BC0: define void @foo
-; CHECK-BC0: declare void @bar
+; CHECK-BC0: define dso_local void @foo
+; CHECK-BC0: declare dso_local void @bar
; CHECK0-NOT: bar
; CHECK0: T foo
; CHECK0-NOT: bar
@@ -19,8 +19,8 @@ define void @foo() {
ret void
}
-; CHECK-BC1: declare void @foo
-; CHECK-BC1: define void @bar
+; CHECK-BC1: declare dso_local void @foo
+; CHECK-BC1: define dso_local void @bar
; CHECK1-NOT: foo
; CHECK1: T bar
; CHECK1-NOT: foo
diff --git a/test/tools/gold/X86/thinlto_linkonceresolution.ll b/test/tools/gold/X86/thinlto_linkonceresolution.ll
index bf2d22a9ef7..c56d6ce2857 100644
--- a/test/tools/gold/X86/thinlto_linkonceresolution.ll
+++ b/test/tools/gold/X86/thinlto_linkonceresolution.ll
@@ -21,7 +21,7 @@
; confirm the weak linkage directly in the saved opt bitcode files.
; CHECK-NOT: U f
; OPT-NOT: @f()
-; OPT2: define weak_odr hidden void @f()
+; OPT2: define weak_odr dso_local hidden void @f()
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/tools/gold/X86/thinlto_weak_library.ll b/test/tools/gold/X86/thinlto_weak_library.ll
index 6a04fc0db0e..9e7b4794c65 100644
--- a/test/tools/gold/X86/thinlto_weak_library.ll
+++ b/test/tools/gold/X86/thinlto_weak_library.ll
@@ -24,7 +24,7 @@
; copy of f() (and didn't simply convert to available_externally, which
; would incorrectly enable inlining).
; RUN: llvm-dis %t2.o.1.promote.bc -o - | FileCheck %s
-; CHECK: declare i32 @f()
+; CHECK: declare dso_local i32 @f()
; ModuleID = 'thinlto_weak_library.c'
source_filename = "thinlto_weak_library.c"
diff --git a/test/tools/gold/X86/visibility.ll b/test/tools/gold/X86/visibility.ll
index 1c70ebf5c46..61f565d2da4 100644
--- a/test/tools/gold/X86/visibility.ll
+++ b/test/tools/gold/X86/visibility.ll
@@ -17,7 +17,7 @@
; CHECK-NEXT: STV_PROTECTED
; CHECK-NEXT: ]
-; IR: define void @foo
+; IR: define dso_local void @foo
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/tools/llvm-ar/default-add.test b/test/tools/llvm-ar/default-add.test
index 88719e4efce..68e41c24910 100644
--- a/test/tools/llvm-ar/default-add.test
+++ b/test/tools/llvm-ar/default-add.test
@@ -4,7 +4,8 @@ RUN: yaml2obj %S/Inputs/coff.yaml -o %t-coff.o
RUN: rm -f %t.ar
RUN: llvm-ar crs %t.ar %t-macho.o
RUN: grep -q __.SYMDEF %t.ar
-RUN: llvm-ar crs %t.ar %t-coff.o
+Test that an option string prefixed by a dash works.
+RUN: llvm-ar -crs %t.ar %t-coff.o
RUN: grep -q __.SYMDEF %t.ar
RUN: rm -f %t.ar
diff --git a/test/tools/llvm-cfi-verify/X86/Inputs/protected-lineinfo.s b/test/tools/llvm-cfi-verify/X86/Inputs/protected-lineinfo.s
new file mode 100644
index 00000000000..f8cfcb8d15c
--- /dev/null
+++ b/test/tools/llvm-cfi-verify/X86/Inputs/protected-lineinfo.s
@@ -0,0 +1,195 @@
+# Source (tiny.cc):
+# void a() {}
+# void b() {}
+# int main(int argc, char** argv) {
+# void(*ptr)();
+# if (argc == 1)
+# ptr = &a;
+# else
+# ptr = &b;
+# ptr();
+# }
+# Compile with (output is in tiny.s.0):
+# clang++ -flto -fsanitize=cfi -fvisibility=hidden -c tiny.cc -o tiny.o -gmlt
+# clang++ tiny.o -o tiny -flto -fuse-ld=gold -Wl,-plugin-opt,save-temps
+# clang++ -fsanitize=cfi -flto -fvisibility=hidden -c tiny.cc -o tiny.o -gmlt
+# llvm-lto2 run @tiny.resolution.txt -o tiny.s -filetype=asm
+
+ .text
+ .file "ld-temp.o"
+ .p2align 4, 0x90
+ .type _Z1av.cfi,@function
+_Z1av.cfi:
+.Lfunc_begin0:
+ .file 1 "tiny.cc"
+ .loc 1 1 0
+ .cfi_startproc
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+.Ltmp0:
+ .loc 1 1 11 prologue_end
+ popq %rbp
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size _Z1av.cfi, .Lfunc_end0-_Z1av.cfi
+ .cfi_endproc
+
+ .p2align 4, 0x90
+ .type _Z1bv.cfi,@function
+_Z1bv.cfi:
+.Lfunc_begin1:
+ .loc 1 2 0
+ .cfi_startproc
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+.Ltmp2:
+ .loc 1 2 11 prologue_end
+ popq %rbp
+ retq
+.Ltmp3:
+.Lfunc_end1:
+ .size _Z1bv.cfi, .Lfunc_end1-_Z1bv.cfi
+ .cfi_endproc
+
+ .hidden main
+ .globl main
+ .p2align 4, 0x90
+ .type main,@function
+main:
+.Lfunc_begin2:
+ .loc 1 4 0
+ .cfi_startproc
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ subq $32, %rsp
+ movl $0, -8(%rbp)
+ movl %edi, -4(%rbp)
+ movq %rsi, -24(%rbp)
+.Ltmp4:
+ .loc 1 6 12 prologue_end
+ cmpl $1, -4(%rbp)
+ .loc 1 6 7 is_stmt 0
+ jne .LBB2_2
+ .loc 1 0 7
+ leaq _Z1av(%rip), %rax
+ .loc 1 7 9 is_stmt 1
+ movq %rax, -16(%rbp)
+ .loc 1 7 5 is_stmt 0
+ jmp .LBB2_3
+.LBB2_2:
+ .loc 1 0 5
+ leaq _Z1bv(%rip), %rax
+ .loc 1 9 9 is_stmt 1
+ movq %rax, -16(%rbp)
+.LBB2_3:
+ .loc 1 0 9 is_stmt 0
+ leaq .L.cfi.jumptable(%rip), %rcx
+ .loc 1 11 3 is_stmt 1
+ movq -16(%rbp), %rax
+ movq %rax, %rdx
+ subq %rcx, %rdx
+ movq %rdx, %rcx
+ shrq $3, %rcx
+ shlq $61, %rdx
+ orq %rcx, %rdx
+ cmpq $1, %rdx
+ jbe .LBB2_5
+ ud2
+.LBB2_5:
+ callq *%rax
+ .loc 1 12 1
+ movl -8(%rbp), %eax
+ addq $32, %rsp
+ popq %rbp
+ retq
+.Ltmp5:
+.Lfunc_end2:
+ .size main, .Lfunc_end2-main
+ .cfi_endproc
+
+ .p2align 3, 0x90
+ .type .L.cfi.jumptable,@function
+.L.cfi.jumptable:
+.Lfunc_begin3:
+ .cfi_startproc
+ #APP
+ jmp _Z1av.cfi@PLT
+ int3
+ int3
+ int3
+ jmp _Z1bv.cfi@PLT
+ int3
+ int3
+ int3
+
+ #NO_APP
+.Lfunc_end3:
+ .size .L.cfi.jumptable, .Lfunc_end3-.L.cfi.jumptable
+ .cfi_endproc
+
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 6.0.0 (trunk 316774)"
+.Linfo_string1:
+ .asciz "tiny.cc"
+.Linfo_string2:
+ .asciz ""
+ .section .debug_abbrev,"",@progbits
+ .byte 1
+ .byte 17
+ .byte 0
+ .byte 37
+ .byte 14
+ .byte 19
+ .byte 5
+ .byte 3
+ .byte 14
+ .byte 16
+ .byte 23
+ .byte 27
+ .byte 14
+ .byte 17
+ .byte 1
+ .byte 18
+ .byte 6
+ .byte 0
+ .byte 0
+ .byte 0
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long 38
+ .short 4
+ .long .debug_abbrev
+ .byte 8
+ .byte 1
+ .long .Linfo_string0
+ .short 4
+ .long .Linfo_string1
+ .long .Lline_table_start0
+ .long .Linfo_string2
+ .quad .Lfunc_begin0
+ .long .Lfunc_end2-.Lfunc_begin0
+ .section .debug_ranges,"",@progbits
+ .section .debug_macinfo,"",@progbits
+.Lcu_macro_begin0:
+ .byte 0
+
+ .type _Z1av,@function
+_Z1av = .L.cfi.jumptable
+ .type _Z1bv,@function
+_Z1bv = .L.cfi.jumptable+8
+ .ident "clang version 6.0.0 (trunk 316774)"
+ .section ".note.GNU-stack","",@progbits
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
+
diff --git a/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-fullinfo.s b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-fullinfo.s
new file mode 100644
index 00000000000..7b5ca07d7e4
--- /dev/null
+++ b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-fullinfo.s
@@ -0,0 +1,380 @@
+# Source (tiny.cc):
+# void a() {}
+# void b() {}
+# int main(int argc, char** argv) {
+# void(*ptr)();
+# if (argc == 1)
+# ptr = &a;
+# else
+# ptr = &b;
+# ptr();
+# }
+# Compile with:
+# clang++ -g tiny.cc -S -o tiny.s
+
+ .text
+ .file "tiny.cc"
+ .globl _Z1av # -- Begin function _Z1av
+ .p2align 4, 0x90
+ .type _Z1av,@function
+_Z1av: # @_Z1av
+.Lfunc_begin0:
+ .file 1 "tiny.cc"
+ .loc 1 1 0 # tiny.cc:1:0
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+.Ltmp0:
+ .loc 1 1 11 prologue_end # tiny.cc:1:11
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size _Z1av, .Lfunc_end0-_Z1av
+ .cfi_endproc
+ # -- End function
+ .globl _Z1bv # -- Begin function _Z1bv
+ .p2align 4, 0x90
+ .type _Z1bv,@function
+_Z1bv: # @_Z1bv
+.Lfunc_begin1:
+ .loc 1 2 0 # tiny.cc:2:0
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+.Ltmp2:
+ .loc 1 2 11 prologue_end # tiny.cc:2:11
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp3:
+.Lfunc_end1:
+ .size _Z1bv, .Lfunc_end1-_Z1bv
+ .cfi_endproc
+ # -- End function
+ .globl main # -- Begin function main
+ .p2align 4, 0x90
+ .type main,@function
+main: # @main
+.Lfunc_begin2:
+ .loc 1 4 0 # tiny.cc:4:0
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ subq $32, %rsp
+ movl $0, -4(%rbp)
+ movl %edi, -8(%rbp)
+ movq %rsi, -16(%rbp)
+.Ltmp4:
+ .loc 1 6 12 prologue_end # tiny.cc:6:12
+ cmpl $1, -8(%rbp)
+.Ltmp5:
+ .loc 1 6 7 is_stmt 0 # tiny.cc:6:7
+ jne .LBB2_2
+# BB#1:
+ .loc 1 0 7 # tiny.cc:0:7
+ movabsq $_Z1av, %rax
+.Ltmp6:
+ .loc 1 7 9 is_stmt 1 # tiny.cc:7:9
+ movq %rax, -24(%rbp)
+ .loc 1 7 5 is_stmt 0 # tiny.cc:7:5
+ jmp .LBB2_3
+.LBB2_2:
+ .loc 1 0 5 # tiny.cc:0:5
+ movabsq $_Z1bv, %rax
+ .loc 1 9 9 is_stmt 1 # tiny.cc:9:9
+ movq %rax, -24(%rbp)
+.Ltmp7:
+.LBB2_3:
+ .loc 1 11 3 # tiny.cc:11:3
+ callq *-24(%rbp)
+ .loc 1 12 1 # tiny.cc:12:1
+ movl -4(%rbp), %eax
+ addq $32, %rsp
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp8:
+.Lfunc_end2:
+ .size main, .Lfunc_end2-main
+ .cfi_endproc
+ # -- End function
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 6.0.0 (trunk 317104)" # string offset=0
+.Linfo_string1:
+ .asciz "tiny.cc" # string offset=35
+.Linfo_string2:
+ .asciz "/tmp/a/b" # string offset=43
+.Linfo_string3:
+ .asciz "_Z1av" # string offset=52
+.Linfo_string4:
+ .asciz "a" # string offset=58
+.Linfo_string5:
+ .asciz "_Z1bv" # string offset=60
+.Linfo_string6:
+ .asciz "b" # string offset=66
+.Linfo_string7:
+ .asciz "main" # string offset=68
+.Linfo_string8:
+ .asciz "int" # string offset=73
+.Linfo_string9:
+ .asciz "argc" # string offset=77
+.Linfo_string10:
+ .asciz "argv" # string offset=82
+.Linfo_string11:
+ .asciz "char" # string offset=87
+.Linfo_string12:
+ .asciz "ptr" # string offset=92
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 14 # DW_FORM_strp
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 14 # DW_FORM_strp
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 14 # DW_FORM_strp
+ .ascii "\264B" # DW_AT_GNU_pubnames
+ .byte 25 # DW_FORM_flag_present
+ .byte 17 # DW_AT_low_pc
+ .byte 1 # DW_FORM_addr
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 0 # DW_CHILDREN_no
+ .byte 17 # DW_AT_low_pc
+ .byte 1 # DW_FORM_addr
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 110 # DW_AT_linkage_name
+ .byte 14 # DW_FORM_strp
+ .byte 3 # DW_AT_name
+ .byte 14 # DW_FORM_strp
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 1 # DW_FORM_addr
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 14 # DW_FORM_strp
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 14 # DW_FORM_strp
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 14 # DW_FORM_strp
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 14 # DW_FORM_strp
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 15 # DW_TAG_pointer_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 21 # DW_TAG_subroutine_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long 187 # Length of Unit
+ .short 4 # DWARF version number
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 8 # Address Size (in bytes)
+ .byte 1 # Abbrev [1] 0xb:0xb4 DW_TAG_compile_unit
+ .long .Linfo_string0 # DW_AT_producer
+ .short 4 # DW_AT_language
+ .long .Linfo_string1 # DW_AT_name
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Linfo_string2 # DW_AT_comp_dir
+ # DW_AT_GNU_pubnames
+ .quad .Lfunc_begin0 # DW_AT_low_pc
+ .long .Lfunc_end2-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 2 # Abbrev [2] 0x2a:0x19 DW_TAG_subprogram
+ .quad .Lfunc_begin0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .long .Linfo_string3 # DW_AT_linkage_name
+ .long .Linfo_string4 # DW_AT_name
+ .byte 1 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ # DW_AT_external
+ .byte 2 # Abbrev [2] 0x43:0x19 DW_TAG_subprogram
+ .quad .Lfunc_begin1 # DW_AT_low_pc
+ .long .Lfunc_end1-.Lfunc_begin1 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .long .Linfo_string5 # DW_AT_linkage_name
+ .long .Linfo_string6 # DW_AT_name
+ .byte 1 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ # DW_AT_external
+ .byte 3 # Abbrev [3] 0x5c:0x44 DW_TAG_subprogram
+ .quad .Lfunc_begin2 # DW_AT_low_pc
+ .long .Lfunc_end2-.Lfunc_begin2 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .long .Linfo_string7 # DW_AT_name
+ .byte 1 # DW_AT_decl_file
+ .byte 4 # DW_AT_decl_line
+ .long 160 # DW_AT_type
+ # DW_AT_external
+ .byte 4 # Abbrev [4] 0x75:0xe DW_TAG_formal_parameter
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 120
+ .long .Linfo_string9 # DW_AT_name
+ .byte 1 # DW_AT_decl_file
+ .byte 4 # DW_AT_decl_line
+ .long 160 # DW_AT_type
+ .byte 4 # Abbrev [4] 0x83:0xe DW_TAG_formal_parameter
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 112
+ .long .Linfo_string10 # DW_AT_name
+ .byte 1 # DW_AT_decl_file
+ .byte 4 # DW_AT_decl_line
+ .long 167 # DW_AT_type
+ .byte 5 # Abbrev [5] 0x91:0xe DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 104
+ .long .Linfo_string12 # DW_AT_name
+ .byte 1 # DW_AT_decl_file
+ .byte 5 # DW_AT_decl_line
+ .long 184 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 6 # Abbrev [6] 0xa0:0x7 DW_TAG_base_type
+ .long .Linfo_string8 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 7 # Abbrev [7] 0xa7:0x5 DW_TAG_pointer_type
+ .long 172 # DW_AT_type
+ .byte 7 # Abbrev [7] 0xac:0x5 DW_TAG_pointer_type
+ .long 177 # DW_AT_type
+ .byte 6 # Abbrev [6] 0xb1:0x7 DW_TAG_base_type
+ .long .Linfo_string11 # DW_AT_name
+ .byte 6 # DW_AT_encoding
+ .byte 1 # DW_AT_byte_size
+ .byte 7 # Abbrev [7] 0xb8:0x5 DW_TAG_pointer_type
+ .long 189 # DW_AT_type
+ .byte 8 # Abbrev [8] 0xbd:0x1 DW_TAG_subroutine_type
+ .byte 0 # End Of Children Mark
+ .section .debug_ranges,"",@progbits
+ .section .debug_macinfo,"",@progbits
+.Lcu_macro_begin0:
+ .byte 0 # End Of Macro List Mark
+ .section .debug_pubnames,"",@progbits
+ .long .LpubNames_end0-.LpubNames_begin0 # Length of Public Names Info
+.LpubNames_begin0:
+ .short 2 # DWARF Version
+ .long .Lcu_begin0 # Offset of Compilation Unit Info
+ .long 191 # Compilation Unit Length
+ .long 42 # DIE offset
+ .asciz "a" # External Name
+ .long 67 # DIE offset
+ .asciz "b" # External Name
+ .long 92 # DIE offset
+ .asciz "main" # External Name
+ .long 0 # End Mark
+.LpubNames_end0:
+ .section .debug_pubtypes,"",@progbits
+ .long .LpubTypes_end0-.LpubTypes_begin0 # Length of Public Types Info
+.LpubTypes_begin0:
+ .short 2 # DWARF Version
+ .long .Lcu_begin0 # Offset of Compilation Unit Info
+ .long 191 # Compilation Unit Length
+ .long 160 # DIE offset
+ .asciz "int" # External Name
+ .long 177 # DIE offset
+ .asciz "char" # External Name
+ .long 0 # End Mark
+.LpubTypes_end0:
+
+ .ident "clang version 6.0.0 (trunk 317104)"
+ .section ".note.GNU-stack","",@progbits
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-lineinfo.s b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-lineinfo.s
new file mode 100644
index 00000000000..155f5978b46
--- /dev/null
+++ b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-lineinfo.s
@@ -0,0 +1,159 @@
+# Source (tiny.cc):
+# void a() {}
+# void b() {}
+# int main(int argc, char** argv) {
+# void(*ptr)();
+# if (argc == 1)
+# ptr = &a;
+# else
+# ptr = &b;
+# ptr();
+# }
+# Compile with:
+# clang++ -gmlt tiny.cc -S -o tiny.s
+
+ .text
+ .file "tiny.cc"
+ .globl _Z1av # -- Begin function _Z1av
+ .p2align 4, 0x90
+ .type _Z1av,@function
+_Z1av: # @_Z1av
+.Lfunc_begin0:
+ .file 1 "tiny.cc"
+ .loc 1 1 0 # tiny.cc:1:0
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+.Ltmp0:
+ .loc 1 1 11 prologue_end # tiny.cc:1:11
+ popq %rbp
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size _Z1av, .Lfunc_end0-_Z1av
+ .cfi_endproc
+ # -- End function
+ .globl _Z1bv # -- Begin function _Z1bv
+ .p2align 4, 0x90
+ .type _Z1bv,@function
+_Z1bv: # @_Z1bv
+.Lfunc_begin1:
+ .loc 1 2 0 # tiny.cc:2:0
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+.Ltmp2:
+ .loc 1 2 11 prologue_end # tiny.cc:2:11
+ popq %rbp
+ retq
+.Ltmp3:
+.Lfunc_end1:
+ .size _Z1bv, .Lfunc_end1-_Z1bv
+ .cfi_endproc
+ # -- End function
+ .globl main # -- Begin function main
+ .p2align 4, 0x90
+ .type main,@function
+main: # @main
+.Lfunc_begin2:
+ .loc 1 4 0 # tiny.cc:4:0
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ subq $32, %rsp
+ movl $0, -4(%rbp)
+ movl %edi, -8(%rbp)
+ movq %rsi, -16(%rbp)
+.Ltmp4:
+ .loc 1 6 12 prologue_end # tiny.cc:6:12
+ cmpl $1, -8(%rbp)
+ .loc 1 6 7 is_stmt 0 # tiny.cc:6:7
+ jne .LBB2_2
+# BB#1:
+ .loc 1 0 7 # tiny.cc:0:7
+ movabsq $_Z1av, %rax
+ .loc 1 7 9 is_stmt 1 # tiny.cc:7:9
+ movq %rax, -24(%rbp)
+ .loc 1 7 5 is_stmt 0 # tiny.cc:7:5
+ jmp .LBB2_3
+.LBB2_2:
+ .loc 1 0 5 # tiny.cc:0:5
+ movabsq $_Z1bv, %rax
+ .loc 1 9 9 is_stmt 1 # tiny.cc:9:9
+ movq %rax, -24(%rbp)
+.LBB2_3:
+ .loc 1 11 3 # tiny.cc:11:3
+ callq *-24(%rbp)
+ .loc 1 12 1 # tiny.cc:12:1
+ movl -4(%rbp), %eax
+ addq $32, %rsp
+ popq %rbp
+ retq
+.Ltmp5:
+.Lfunc_end2:
+ .size main, .Lfunc_end2-main
+ .cfi_endproc
+ # -- End function
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 6.0.0 (trunk 316774)" # string offset=0
+.Linfo_string1:
+ .asciz "tiny.cc" # string offset=35
+.Linfo_string2:
+ .asciz "/tmp/a/b" # string offset=43
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 0 # DW_CHILDREN_no
+ .byte 37 # DW_AT_producer
+ .byte 14 # DW_FORM_strp
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 14 # DW_FORM_strp
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 14 # DW_FORM_strp
+ .byte 17 # DW_AT_low_pc
+ .byte 1 # DW_FORM_addr
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long 38 # Length of Unit
+ .short 4 # DWARF version number
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 8 # Address Size (in bytes)
+ .byte 1 # Abbrev [1] 0xb:0x1f DW_TAG_compile_unit
+ .long .Linfo_string0 # DW_AT_producer
+ .short 4 # DW_AT_language
+ .long .Linfo_string1 # DW_AT_name
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Linfo_string2 # DW_AT_comp_dir
+ .quad .Lfunc_begin0 # DW_AT_low_pc
+ .long .Lfunc_end2-.Lfunc_begin0 # DW_AT_high_pc
+ .section .debug_ranges,"",@progbits
+ .section .debug_macinfo,"",@progbits
+.Lcu_macro_begin0:
+ .byte 0 # End Of Macro List Mark
+
+ .ident "clang version 6.0.0 (trunk 316774)"
+ .section ".note.GNU-stack","",@progbits
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-nolineinfo.s b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-nolineinfo.s
new file mode 100644
index 00000000000..2d3cf2f484e
--- /dev/null
+++ b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-nolineinfo.s
@@ -0,0 +1,87 @@
+# Source (tiny.cc):
+# void a() {}
+# void b() {}
+# int main(int argc, char** argv) {
+# void(*ptr)();
+# if (argc == 1)
+# ptr = &a;
+# else
+# ptr = &b;
+# ptr();
+# }
+# Compile with:
+# clang++ tiny.cc -S -o tiny.s
+
+ .text
+ .file "tiny.cc"
+ .globl _Z1av # -- Begin function _Z1av
+ .p2align 4, 0x90
+ .type _Z1av,@function
+_Z1av: # @_Z1av
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ popq %rbp
+ retq
+.Lfunc_end0:
+ .size _Z1av, .Lfunc_end0-_Z1av
+ .cfi_endproc
+ # -- End function
+ .globl _Z1bv # -- Begin function _Z1bv
+ .p2align 4, 0x90
+ .type _Z1bv,@function
+_Z1bv: # @_Z1bv
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ popq %rbp
+ retq
+.Lfunc_end1:
+ .size _Z1bv, .Lfunc_end1-_Z1bv
+ .cfi_endproc
+ # -- End function
+ .globl main # -- Begin function main
+ .p2align 4, 0x90
+ .type main,@function
+main: # @main
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ subq $32, %rsp
+ movl $0, -4(%rbp)
+ movl %edi, -8(%rbp)
+ movq %rsi, -16(%rbp)
+ cmpl $1, -8(%rbp)
+ jne .LBB2_2
+# BB#1:
+ movabsq $_Z1av, %rax
+ movq %rax, -24(%rbp)
+ jmp .LBB2_3
+.LBB2_2:
+ movabsq $_Z1bv, %rax
+ movq %rax, -24(%rbp)
+.LBB2_3:
+ callq *-24(%rbp)
+ movl -4(%rbp), %eax
+ addq $32, %rsp
+ popq %rbp
+ retq
+.Lfunc_end2:
+ .size main, .Lfunc_end2-main
+ .cfi_endproc
+ # -- End function
+
+ .ident "clang version 6.0.0 (trunk 316774)"
+ .section ".note.GNU-stack","",@progbits
diff --git a/test/tools/llvm-cfi-verify/X86/blacklist-expected-unprotected.s b/test/tools/llvm-cfi-verify/X86/blacklist-expected-unprotected.s
new file mode 100644
index 00000000000..fbcfcc2a7cc
--- /dev/null
+++ b/test/tools/llvm-cfi-verify/X86/blacklist-expected-unprotected.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc %S/Inputs/unprotected-lineinfo.s -filetype obj \
+# RUN: -triple x86_64-linux-elf -o %t.o
+# RUN: echo "src:*tiny*" > %t.blacklist.txt
+# RUN: llvm-cfi-verify %t.o %t.blacklist.txt | FileCheck %s
+
+# CHECK-LABEL: U
+# CHECK-NEXT: tiny.cc:11
+# CHECK-NEXT: BLACKLIST MATCH, 'src'
+# CHECK-NEXT: ====> Expected Unprotected
+
+# CHECK: Expected Protected: 0 (0.00%)
+# CHECK: Unexpected Protected: 0 (0.00%)
+# CHECK: Expected Unprotected: 1 (100.00%)
+# CHECK: Unexpected Unprotected (BAD): 0 (0.00%)
+
+# Source: (blacklist.txt):
+# src:*tiny*
diff --git a/test/tools/llvm-cfi-verify/X86/blacklist-match-fun.s b/test/tools/llvm-cfi-verify/X86/blacklist-match-fun.s
new file mode 100644
index 00000000000..3ea829395c4
--- /dev/null
+++ b/test/tools/llvm-cfi-verify/X86/blacklist-match-fun.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc %S/Inputs/unprotected-fullinfo.s -filetype obj \
+# RUN: -triple x86_64-linux-elf -o %t.o
+# RUN: echo "fun:*main*" > %t.blacklist.txt
+# RUN: llvm-cfi-verify %t.o %t.blacklist.txt | FileCheck %s
+
+# CHECK-LABEL: U
+# CHECK-NEXT: tiny.cc:11
+# CHECK-NEXT: BLACKLIST MATCH, 'fun'
+# CHECK-NEXT: ====> Expected Unprotected
+
+# CHECK: Expected Protected: 0 (0.00%)
+# CHECK: Unexpected Protected: 0 (0.00%)
+# CHECK: Expected Unprotected: 1 (100.00%)
+# CHECK: Unexpected Unprotected (BAD): 0 (0.00%)
+
+# Source: (blacklist.txt):
+# fun:*main*
diff --git a/test/tools/llvm-cfi-verify/X86/blacklist-unexpected-protected.s b/test/tools/llvm-cfi-verify/X86/blacklist-unexpected-protected.s
new file mode 100644
index 00000000000..c6ddf2b5d11
--- /dev/null
+++ b/test/tools/llvm-cfi-verify/X86/blacklist-unexpected-protected.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc %S/Inputs/protected-lineinfo.s -filetype obj \
+# RUN: -triple x86_64-linux-elf -o %t.o
+# RUN: echo "src:*tiny*" > %t.blacklist.txt
+# RUN: llvm-cfi-verify %t.o %t.blacklist.txt | FileCheck %s
+
+# CHECK-LABEL: P
+# CHECK-NEXT: tiny.cc:11
+# CHECK-NEXT: BLACKLIST MATCH, 'src'
+# CHECK-NEXT: ====> Unexpected Protected
+
+# CHECK: Expected Protected: 0 (0.00%)
+# CHECK: Unexpected Protected: 1 (100.00%)
+# CHECK: Expected Unprotected: 0 (0.00%)
+# CHECK: Unexpected Unprotected (BAD): 0 (0.00%)
+
+# Source: (blacklist.txt):
+# src:*tiny*
diff --git a/test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s b/test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s
index bf1d87a2eb8..e9b873471cb 100644
--- a/test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s
+++ b/test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s
@@ -10,7 +10,10 @@
# reporting of the cfi-verify program. It should only find a single indirect CF
# instruction at `tiny.cc:11` (see protected-lineinfo.s for the source).
-# CHECK: Unprotected: 0 (0.00%), Protected: 1 (100.00%)
+# CHECK: Expected Protected: 1 (100.00%)
+# CHECK: Unexpected Protected: 0 (0.00%)
+# CHECK: Expected Unprotected: 0 (0.00%)
+# CHECK: Unexpected Unprotected (BAD): 0 (0.00%)
.text
.file "ld-temp.o"
diff --git a/test/tools/llvm-cfi-verify/X86/protected-lineinfo.s b/test/tools/llvm-cfi-verify/X86/protected-lineinfo.s
index e3bb0f7af46..8eaf2e5e725 100644
--- a/test/tools/llvm-cfi-verify/X86/protected-lineinfo.s
+++ b/test/tools/llvm-cfi-verify/X86/protected-lineinfo.s
@@ -1,203 +1,11 @@
-# RUN: llvm-mc %s -filetype obj -triple x86_64-linux-elf -o %t.o
+# RUN: llvm-mc %S/Inputs/protected-lineinfo.s -filetype obj \
+# RUN: -triple x86_64-linux-elf -o %t.o
# RUN: llvm-cfi-verify %t.o | FileCheck %s
# CHECK-LABEL: P
# CHECK-NEXT: tiny.cc:11
-# CHECK: Unprotected: 0 (0.00%), Protected: 1 (100.00%)
-
-# Source (tiny.cc):
-# void a() {}
-# void b() {}
-# int main(int argc, char** argv) {
-# void(*ptr)();
-# if (argc == 1)
-# ptr = &a;
-# else
-# ptr = &b;
-# ptr();
-# }
-# Compile with (output is in tiny.s.0):
-# clang++ -flto -fsanitize=cfi -fvisibility=hidden -c tiny.cc -o tiny.o -gmlt
-# clang++ tiny.o -o tiny -flto -fuse-ld=gold -Wl,-plugin-opt,save-temps
-# clang++ -fsanitize=cfi -flto -fvisibility=hidden -c tiny.cc -o tiny.o -gmlt
-# llvm-lto2 run @tiny.resolution.txt -o tiny.s -filetype=asm
-
- .text
- .file "ld-temp.o"
- .p2align 4, 0x90
- .type _Z1av.cfi,@function
-_Z1av.cfi:
-.Lfunc_begin0:
- .file 1 "tiny.cc"
- .loc 1 1 0
- .cfi_startproc
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
-.Ltmp0:
- .loc 1 1 11 prologue_end
- popq %rbp
- retq
-.Ltmp1:
-.Lfunc_end0:
- .size _Z1av.cfi, .Lfunc_end0-_Z1av.cfi
- .cfi_endproc
-
- .p2align 4, 0x90
- .type _Z1bv.cfi,@function
-_Z1bv.cfi:
-.Lfunc_begin1:
- .loc 1 2 0
- .cfi_startproc
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
-.Ltmp2:
- .loc 1 2 11 prologue_end
- popq %rbp
- retq
-.Ltmp3:
-.Lfunc_end1:
- .size _Z1bv.cfi, .Lfunc_end1-_Z1bv.cfi
- .cfi_endproc
-
- .hidden main
- .globl main
- .p2align 4, 0x90
- .type main,@function
-main:
-.Lfunc_begin2:
- .loc 1 4 0
- .cfi_startproc
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
- subq $32, %rsp
- movl $0, -8(%rbp)
- movl %edi, -4(%rbp)
- movq %rsi, -24(%rbp)
-.Ltmp4:
- .loc 1 6 12 prologue_end
- cmpl $1, -4(%rbp)
- .loc 1 6 7 is_stmt 0
- jne .LBB2_2
- .loc 1 0 7
- leaq _Z1av(%rip), %rax
- .loc 1 7 9 is_stmt 1
- movq %rax, -16(%rbp)
- .loc 1 7 5 is_stmt 0
- jmp .LBB2_3
-.LBB2_2:
- .loc 1 0 5
- leaq _Z1bv(%rip), %rax
- .loc 1 9 9 is_stmt 1
- movq %rax, -16(%rbp)
-.LBB2_3:
- .loc 1 0 9 is_stmt 0
- leaq .L.cfi.jumptable(%rip), %rcx
- .loc 1 11 3 is_stmt 1
- movq -16(%rbp), %rax
- movq %rax, %rdx
- subq %rcx, %rdx
- movq %rdx, %rcx
- shrq $3, %rcx
- shlq $61, %rdx
- orq %rcx, %rdx
- cmpq $1, %rdx
- jbe .LBB2_5
- ud2
-.LBB2_5:
- callq *%rax
- .loc 1 12 1
- movl -8(%rbp), %eax
- addq $32, %rsp
- popq %rbp
- retq
-.Ltmp5:
-.Lfunc_end2:
- .size main, .Lfunc_end2-main
- .cfi_endproc
-
- .p2align 3, 0x90
- .type .L.cfi.jumptable,@function
-.L.cfi.jumptable:
-.Lfunc_begin3:
- .cfi_startproc
- #APP
- jmp _Z1av.cfi@PLT
- int3
- int3
- int3
- jmp _Z1bv.cfi@PLT
- int3
- int3
- int3
-
- #NO_APP
-.Lfunc_end3:
- .size .L.cfi.jumptable, .Lfunc_end3-.L.cfi.jumptable
- .cfi_endproc
-
- .section .debug_str,"MS",@progbits,1
-.Linfo_string0:
- .asciz "clang version 6.0.0 (trunk 316774)"
-.Linfo_string1:
- .asciz "tiny.cc"
-.Linfo_string2:
- .asciz ""
- .section .debug_abbrev,"",@progbits
- .byte 1
- .byte 17
- .byte 0
- .byte 37
- .byte 14
- .byte 19
- .byte 5
- .byte 3
- .byte 14
- .byte 16
- .byte 23
- .byte 27
- .byte 14
- .byte 17
- .byte 1
- .byte 18
- .byte 6
- .byte 0
- .byte 0
- .byte 0
- .section .debug_info,"",@progbits
-.Lcu_begin0:
- .long 38
- .short 4
- .long .debug_abbrev
- .byte 8
- .byte 1
- .long .Linfo_string0
- .short 4
- .long .Linfo_string1
- .long .Lline_table_start0
- .long .Linfo_string2
- .quad .Lfunc_begin0
- .long .Lfunc_end2-.Lfunc_begin0
- .section .debug_ranges,"",@progbits
- .section .debug_macinfo,"",@progbits
-.Lcu_macro_begin0:
- .byte 0
-
- .type _Z1av,@function
-_Z1av = .L.cfi.jumptable
- .type _Z1bv,@function
-_Z1bv = .L.cfi.jumptable+8
- .ident "clang version 6.0.0 (trunk 316774)"
- .section ".note.GNU-stack","",@progbits
- .section .debug_line,"",@progbits
-.Lline_table_start0:
-
+# CHECK: Expected Protected: 1 (100.00%)
+# CHECK: Unexpected Protected: 0 (0.00%)
+# CHECK: Expected Unprotected: 0 (0.00%)
+# CHECK: Unexpected Unprotected (BAD): 0 (0.00%)
diff --git a/test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s b/test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s
index d8819e16e37..65782cb5e42 100644
--- a/test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s
+++ b/test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s
@@ -1,167 +1,11 @@
-# RUN: llvm-mc %s -filetype obj -triple x86_64-linux-elf -o %t.o
+# RUN: llvm-mc %S/Inputs/unprotected-lineinfo.s -filetype obj \
+# RUN: -triple x86_64-linux-elf -o %t.o
# RUN: llvm-cfi-verify %t.o | FileCheck %s
# CHECK-LABEL: U
# CHECK-NEXT: tiny.cc:11
-# CHECK: Unprotected: 1 (100.00%), Protected: 0 (0.00%)
-
-# Source (tiny.cc):
-# void a() {}
-# void b() {}
-# int main(int argc, char** argv) {
-# void(*ptr)();
-# if (argc == 1)
-# ptr = &a;
-# else
-# ptr = &b;
-# ptr();
-# }
-# Compile with:
-# clang++ -gmlt tiny.cc -S -o tiny.s
-
- .text
- .file "tiny.cc"
- .globl _Z1av # -- Begin function _Z1av
- .p2align 4, 0x90
- .type _Z1av,@function
-_Z1av: # @_Z1av
-.Lfunc_begin0:
- .file 1 "tiny.cc"
- .loc 1 1 0 # tiny.cc:1:0
- .cfi_startproc
-# BB#0:
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
-.Ltmp0:
- .loc 1 1 11 prologue_end # tiny.cc:1:11
- popq %rbp
- retq
-.Ltmp1:
-.Lfunc_end0:
- .size _Z1av, .Lfunc_end0-_Z1av
- .cfi_endproc
- # -- End function
- .globl _Z1bv # -- Begin function _Z1bv
- .p2align 4, 0x90
- .type _Z1bv,@function
-_Z1bv: # @_Z1bv
-.Lfunc_begin1:
- .loc 1 2 0 # tiny.cc:2:0
- .cfi_startproc
-# BB#0:
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
-.Ltmp2:
- .loc 1 2 11 prologue_end # tiny.cc:2:11
- popq %rbp
- retq
-.Ltmp3:
-.Lfunc_end1:
- .size _Z1bv, .Lfunc_end1-_Z1bv
- .cfi_endproc
- # -- End function
- .globl main # -- Begin function main
- .p2align 4, 0x90
- .type main,@function
-main: # @main
-.Lfunc_begin2:
- .loc 1 4 0 # tiny.cc:4:0
- .cfi_startproc
-# BB#0:
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
- subq $32, %rsp
- movl $0, -4(%rbp)
- movl %edi, -8(%rbp)
- movq %rsi, -16(%rbp)
-.Ltmp4:
- .loc 1 6 12 prologue_end # tiny.cc:6:12
- cmpl $1, -8(%rbp)
- .loc 1 6 7 is_stmt 0 # tiny.cc:6:7
- jne .LBB2_2
-# BB#1:
- .loc 1 0 7 # tiny.cc:0:7
- movabsq $_Z1av, %rax
- .loc 1 7 9 is_stmt 1 # tiny.cc:7:9
- movq %rax, -24(%rbp)
- .loc 1 7 5 is_stmt 0 # tiny.cc:7:5
- jmp .LBB2_3
-.LBB2_2:
- .loc 1 0 5 # tiny.cc:0:5
- movabsq $_Z1bv, %rax
- .loc 1 9 9 is_stmt 1 # tiny.cc:9:9
- movq %rax, -24(%rbp)
-.LBB2_3:
- .loc 1 11 3 # tiny.cc:11:3
- callq *-24(%rbp)
- .loc 1 12 1 # tiny.cc:12:1
- movl -4(%rbp), %eax
- addq $32, %rsp
- popq %rbp
- retq
-.Ltmp5:
-.Lfunc_end2:
- .size main, .Lfunc_end2-main
- .cfi_endproc
- # -- End function
- .section .debug_str,"MS",@progbits,1
-.Linfo_string0:
- .asciz "clang version 6.0.0 (trunk 316774)" # string offset=0
-.Linfo_string1:
- .asciz "tiny.cc" # string offset=35
-.Linfo_string2:
- .asciz "/tmp/a/b" # string offset=43
- .section .debug_abbrev,"",@progbits
- .byte 1 # Abbreviation Code
- .byte 17 # DW_TAG_compile_unit
- .byte 0 # DW_CHILDREN_no
- .byte 37 # DW_AT_producer
- .byte 14 # DW_FORM_strp
- .byte 19 # DW_AT_language
- .byte 5 # DW_FORM_data2
- .byte 3 # DW_AT_name
- .byte 14 # DW_FORM_strp
- .byte 16 # DW_AT_stmt_list
- .byte 23 # DW_FORM_sec_offset
- .byte 27 # DW_AT_comp_dir
- .byte 14 # DW_FORM_strp
- .byte 17 # DW_AT_low_pc
- .byte 1 # DW_FORM_addr
- .byte 18 # DW_AT_high_pc
- .byte 6 # DW_FORM_data4
- .byte 0 # EOM(1)
- .byte 0 # EOM(2)
- .byte 0 # EOM(3)
- .section .debug_info,"",@progbits
-.Lcu_begin0:
- .long 38 # Length of Unit
- .short 4 # DWARF version number
- .long .debug_abbrev # Offset Into Abbrev. Section
- .byte 8 # Address Size (in bytes)
- .byte 1 # Abbrev [1] 0xb:0x1f DW_TAG_compile_unit
- .long .Linfo_string0 # DW_AT_producer
- .short 4 # DW_AT_language
- .long .Linfo_string1 # DW_AT_name
- .long .Lline_table_start0 # DW_AT_stmt_list
- .long .Linfo_string2 # DW_AT_comp_dir
- .quad .Lfunc_begin0 # DW_AT_low_pc
- .long .Lfunc_end2-.Lfunc_begin0 # DW_AT_high_pc
- .section .debug_ranges,"",@progbits
- .section .debug_macinfo,"",@progbits
-.Lcu_macro_begin0:
- .byte 0 # End Of Macro List Mark
-
- .ident "clang version 6.0.0 (trunk 316774)"
- .section ".note.GNU-stack","",@progbits
- .section .debug_line,"",@progbits
-.Lline_table_start0:
+# CHECK: Expected Protected: 0 (0.00%)
+# CHECK: Unexpected Protected: 0 (0.00%)
+# CHECK: Expected Unprotected: 0 (0.00%)
+# CHECK: Unexpected Unprotected (BAD): 1 (100.00%)
diff --git a/test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s b/test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s
index c023a4a84ab..246acf35f5b 100644
--- a/test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s
+++ b/test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s
@@ -1,92 +1,5 @@
-# RUN: llvm-mc %s -filetype obj -triple x86_64-linux-elf -o %t.o
+# RUN: llvm-mc %S/Inputs/unprotected-nolineinfo.s -filetype obj \
+# RUN: -triple x86_64-linux-elf -o %t.o
# RUN: not llvm-cfi-verify %t.o 2>&1 | FileCheck %s
# CHECK: DWARF line information missing. Did you compile with '-g'?
-
-# Source (tiny.cc):
-# void a() {}
-# void b() {}
-# int main(int argc, char** argv) {
-# void(*ptr)();
-# if (argc == 1)
-# ptr = &a;
-# else
-# ptr = &b;
-# ptr();
-# }
-# Compile with:
-# clang++ tiny.cc -S -o tiny.s
-
- .text
- .file "tiny.cc"
- .globl _Z1av # -- Begin function _Z1av
- .p2align 4, 0x90
- .type _Z1av,@function
-_Z1av: # @_Z1av
- .cfi_startproc
-# BB#0:
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
- popq %rbp
- retq
-.Lfunc_end0:
- .size _Z1av, .Lfunc_end0-_Z1av
- .cfi_endproc
- # -- End function
- .globl _Z1bv # -- Begin function _Z1bv
- .p2align 4, 0x90
- .type _Z1bv,@function
-_Z1bv: # @_Z1bv
- .cfi_startproc
-# BB#0:
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
- popq %rbp
- retq
-.Lfunc_end1:
- .size _Z1bv, .Lfunc_end1-_Z1bv
- .cfi_endproc
- # -- End function
- .globl main # -- Begin function main
- .p2align 4, 0x90
- .type main,@function
-main: # @main
- .cfi_startproc
-# BB#0:
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
- subq $32, %rsp
- movl $0, -4(%rbp)
- movl %edi, -8(%rbp)
- movq %rsi, -16(%rbp)
- cmpl $1, -8(%rbp)
- jne .LBB2_2
-# BB#1:
- movabsq $_Z1av, %rax
- movq %rax, -24(%rbp)
- jmp .LBB2_3
-.LBB2_2:
- movabsq $_Z1bv, %rax
- movq %rax, -24(%rbp)
-.LBB2_3:
- callq *-24(%rbp)
- movl -4(%rbp), %eax
- addq $32, %rsp
- popq %rbp
- retq
-.Lfunc_end2:
- .size main, .Lfunc_end2-main
- .cfi_endproc
- # -- End function
-
- .ident "clang version 6.0.0 (trunk 316774)"
- .section ".note.GNU-stack","",@progbits
diff --git a/test/LibDriver/Inputs/a.s b/test/tools/llvm-lib/Inputs/a.s
index 88258e2797f..88258e2797f 100644
--- a/test/LibDriver/Inputs/a.s
+++ b/test/tools/llvm-lib/Inputs/a.s
diff --git a/test/LibDriver/Inputs/b.s b/test/tools/llvm-lib/Inputs/b.s
index 4890c9247c7..4890c9247c7 100644
--- a/test/LibDriver/Inputs/b.s
+++ b/test/tools/llvm-lib/Inputs/b.s
diff --git a/test/LibDriver/Inputs/cl-gl.obj b/test/tools/llvm-lib/Inputs/cl-gl.obj
index ff746557d41..ff746557d41 100755
--- a/test/LibDriver/Inputs/cl-gl.obj
+++ b/test/tools/llvm-lib/Inputs/cl-gl.obj
Binary files differ
diff --git a/test/LibDriver/Inputs/resource.res b/test/tools/llvm-lib/Inputs/resource.res
index f1c799fbbb0..f1c799fbbb0 100644
--- a/test/LibDriver/Inputs/resource.res
+++ b/test/tools/llvm-lib/Inputs/resource.res
Binary files differ
diff --git a/test/LibDriver/infer-output-path.test b/test/tools/llvm-lib/infer-output-path.test
index c63b0abdf6e..c63b0abdf6e 100644
--- a/test/LibDriver/infer-output-path.test
+++ b/test/tools/llvm-lib/infer-output-path.test
diff --git a/test/LibDriver/invalid.test b/test/tools/llvm-lib/invalid.test
index 2978177a431..2978177a431 100644
--- a/test/LibDriver/invalid.test
+++ b/test/tools/llvm-lib/invalid.test
diff --git a/test/LibDriver/libpath.test b/test/tools/llvm-lib/libpath.test
index 26a1e8dc8b6..26a1e8dc8b6 100644
--- a/test/LibDriver/libpath.test
+++ b/test/tools/llvm-lib/libpath.test
diff --git a/test/tools/llvm-lib/lit.local.cfg b/test/tools/llvm-lib/lit.local.cfg
new file mode 100644
index 00000000000..e71f3cc4c41
--- /dev/null
+++ b/test/tools/llvm-lib/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'X86' in config.root.targets:
+ config.unsupported = True
+
diff --git a/test/LibDriver/no-inputs.test b/test/tools/llvm-lib/no-inputs.test
index 95d6555d58c..95d6555d58c 100644
--- a/test/LibDriver/no-inputs.test
+++ b/test/tools/llvm-lib/no-inputs.test
diff --git a/test/LibDriver/resource.test b/test/tools/llvm-lib/resource.test
index 6c3dad50b45..6c3dad50b45 100644
--- a/test/LibDriver/resource.test
+++ b/test/tools/llvm-lib/resource.test
diff --git a/test/LibDriver/thin.test b/test/tools/llvm-lib/thin.test
index c401de41a80..c401de41a80 100644
--- a/test/LibDriver/thin.test
+++ b/test/tools/llvm-lib/thin.test
diff --git a/test/LibDriver/use-paths.test b/test/tools/llvm-lib/use-paths.test
index 971c216127e..971c216127e 100644
--- a/test/LibDriver/use-paths.test
+++ b/test/tools/llvm-lib/use-paths.test
diff --git a/test/tools/llvm-nm/X86/externalonly.test b/test/tools/llvm-nm/X86/externalonly.test
index c3741298786..2a1853b426f 100644
--- a/test/tools/llvm-nm/X86/externalonly.test
+++ b/test/tools/llvm-nm/X86/externalonly.test
@@ -1,4 +1,5 @@
# RUN: llvm-nm -g %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s
+# RUN: llvm-nm -g -g %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s
# CHECK-NOT: EH_frame0
# CHECK: _main
diff --git a/test/tools/llvm-nm/X86/importlibrary.test b/test/tools/llvm-nm/X86/importlibrary.test
index 9111694c2c6..107628d09ef 100644
--- a/test/tools/llvm-nm/X86/importlibrary.test
+++ b/test/tools/llvm-nm/X86/importlibrary.test
@@ -1,5 +1,7 @@
# RUN: llvm-nm -B %S/Inputs/example.lib | FileCheck --match-full-lines %s
+CHECK: 00000000 I __IMPORT_DESCRIPTOR_example
+CHECK: 00000000 I __NULL_IMPORT_DESCRIPTOR
CHECK: 00000000 R __imp__constant
CHECK: 00000000 R _constant
CHECK: 00000000 D __imp__data
diff --git a/test/tools/llvm-objcopy/Inputs/dwarf.dwo b/test/tools/llvm-objcopy/Inputs/dwarf.dwo
new file mode 100644
index 00000000000..4b6fd505506
--- /dev/null
+++ b/test/tools/llvm-objcopy/Inputs/dwarf.dwo
Binary files differ
diff --git a/test/tools/llvm-objcopy/check-addr-offset-align-binary.test b/test/tools/llvm-objcopy/check-addr-offset-align-binary.test
new file mode 100644
index 00000000000..755acceeda2
--- /dev/null
+++ b/test/tools/llvm-objcopy/check-addr-offset-align-binary.test
@@ -0,0 +1,40 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objcopy -O binary %t %t2
+# RUN: od -t x1 %t2 | FileCheck %s
+
+!ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1000
+ AddressAlign: 0x0000000000001000
+ Content: "c3c3c3c3"
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x1008
+ AddressAlign: 0x0000000000000008
+ Content: "3232"
+ProgramHeaders:
+ - Type: PT_LOAD
+ Flags: [ PF_X, PF_R ]
+ VAddr: 0x1000
+ PAddr: 0x1000
+ Align: 0x1000
+ Sections:
+ - Section: .text
+ - Type: PT_LOAD
+ Flags: [ PF_R, PF_W ]
+ VAddr: 0x1008
+ PAddr: 0x1008
+ Align: 0x1000
+ Sections:
+ - Section: .data
+
+# CHECK: 0000000 c3 c3 c3 c3 00 00 00 00 32 32
diff --git a/test/tools/llvm-objcopy/check-addr-offset-align.test b/test/tools/llvm-objcopy/check-addr-offset-align.test
new file mode 100644
index 00000000000..ca2367ba434
--- /dev/null
+++ b/test/tools/llvm-objcopy/check-addr-offset-align.test
@@ -0,0 +1,67 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objcopy %t %t2
+# RUN: llvm-readobj -program-headers %t2 | FileCheck %s
+
+!ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1000
+ AddressAlign: 0x0000000000001000
+ Content: "c3c3c3c3"
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x1008
+ AddressAlign: 0x0000000000000008
+ Content: "3232"
+ProgramHeaders:
+ - Type: PT_LOAD
+ Flags: [ PF_X, PF_R ]
+ VAddr: 0x1000
+ PAddr: 0x1000
+ Align: 0x1000
+ Sections:
+ - Section: .text
+ - Type: PT_LOAD
+ Flags: [ PF_R, PF_W ]
+ VAddr: 0x1008
+ PAddr: 0x1008
+ Align: 0x1000
+ Sections:
+ - Section: .data
+
+#CHECK: ProgramHeaders [
+#CHECK-NEXT: ProgramHeader {
+#CHECK-NEXT: Type: PT_LOAD
+#CHECK-NEXT: Offset: 0x1000
+#CHECK-NEXT: VirtualAddress: 0x1000
+#CHECK-NEXT: PhysicalAddress: 0x1000
+#CHECK-NEXT: FileSize: 4
+#CHECK-NEXT: MemSize: 4
+#CHECK-NEXT: Flags [
+#CHECK-NEXT: PF_R
+#CHECK-NEXT: PF_X
+#CHECK-NEXT: ]
+#CHECK-NEXT: Alignment: 4096
+#CHECK-NEXT: }
+#CHECK-NEXT: ProgramHeader {
+#CHECK-NEXT: Type: PT_LOAD
+#CHECK-NEXT: Offset: 0x1008
+#CHECK-NEXT: VirtualAddress: 0x1008
+#CHECK-NEXT: PhysicalAddress: 0x1008
+#CHECK-NEXT: FileSize: 2
+#CHECK-NEXT: MemSize: 2
+#CHECK-NEXT: Flags [
+#CHECK-NEXT: PF_R
+#CHECK-NEXT: PF_W
+#CHECK-NEXT: ]
+#CHECK-NEXT: Alignment: 4096
+#CHECK-NEXT: }
+#CHECK-NEXT:]
diff --git a/test/tools/llvm-objcopy/drawf-fission.test b/test/tools/llvm-objcopy/drawf-fission.test
new file mode 100644
index 00000000000..112bffbc891
--- /dev/null
+++ b/test/tools/llvm-objcopy/drawf-fission.test
@@ -0,0 +1,43 @@
+# RUN: llvm-objcopy -extract-dwo %p/Inputs/dwarf.dwo %t
+# RUN: llvm-objcopy -strip-dwo %p/Inputs/dwarf.dwo %t2
+# RUN: llvm-objcopy -split-dwo=%t3 %p/Inputs/dwarf.dwo %t4
+# RUN: llvm-readobj -file-headers -sections %t | FileCheck %s -check-prefix=DWARF
+# RUN: llvm-readobj -file-headers -sections %t2 | FileCheck %s -check-prefix=STRIP
+# RUN: diff %t %t3
+# RUN: diff %t2 %t4
+
+#DWARF: SectionHeaderCount: 8
+
+#DWARF: Name: .debug_loc.dwo
+#DWARF: Name: .debug_str.dwo
+#DWARF: Name: .debug_str_offsets.dwo
+#DWARF: Name: .debug_info.dwo
+#DWARF: Name: .debug_abbrev.dwo
+#DWARF: Name: .debug_line.dwo
+#DWARF: Name: .strtab
+
+#STRIP: SectionHeaderCount: 24
+
+#STRIP: Name: .text
+#STRIP: Name: .rodata.str1.1
+#STRIP: Name: .debug_str
+#STRIP: Name: .debug_abbrev
+#STRIP: Name: .debug_info
+#STRIP: Name: .debug_ranges
+#STRIP: Name: .debug_macinfo
+#STRIP: Name: .debug_addr
+#STRIP: Name: .debug_pubnames
+#STRIP: Name: .debug_pubtypes
+#STRIP: Name: .comment
+#STRIP: Name: .note.GNU-stack
+#STRIP: Name: .debug_frame
+#STRIP: Name: .debug_line
+#STRIP: Name: .symtab
+#STRIP: Name: .rela.text
+#STRIP: Name: .rela.debug_info
+#STRIP: Name: .rela.debug_addr
+#STRIP: Name: .rela.debug_pubnames
+#STRIP: Name: .rela.debug_pubtypes
+#STRIP: Name: .rela.debug_frame
+#STRIP: Name: .rela.debug_line
+#STRIP: Name: .strtab
diff --git a/test/tools/llvm-objdump/X86/Inputs/macho-invalid-reloc-section-index b/test/tools/llvm-objdump/X86/Inputs/macho-invalid-reloc-section-index
new file mode 100644
index 00000000000..a9d0b48449b
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/macho-invalid-reloc-section-index
Binary files differ
diff --git a/test/tools/llvm-objdump/X86/malformed-machos.test b/test/tools/llvm-objdump/X86/malformed-machos.test
index 292666a3725..e29df464a4e 100644
--- a/test/tools/llvm-objdump/X86/malformed-machos.test
+++ b/test/tools/llvm-objdump/X86/malformed-machos.test
@@ -66,3 +66,6 @@ INVALID-SYMBOL-LIB_ORDINAL: macho-invalid-symbol-lib_ordinal': truncated or malf
RUN: not llvm-objdump -macho -objc-meta-data %p/Inputs/macho-invalid-bind-entry 2>&1 | FileCheck -check-prefix INVALID-BIND-ENTRY %s
INVALID-BIND-ENTRY: macho-invalid-bind-entry': truncated or malformed object (for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB bad library ordinal: 83 (max 0) for opcode at: 0x0)
+
+RUN: llvm-objdump -macho -r %p/Inputs/macho-invalid-reloc-section-index | FileCheck -check-prefix INVALID-RELOC-SECTION-INDEX %s
+INVALID-RELOC-SECTION-INDEX: 0000000000000021 X86_64_RELOC_UNSIGNED 8388613 (?,?)