authorDavid L. Jones <dlj@google.com>2017-11-15 01:40:05 +0000
committerDavid L. Jones <dlj@google.com>2017-11-15 01:40:05 +0000
commitd5c2cca72463233df77a065f201db31b140eb44d (patch)
tree3f9a978131033302a58b7db7db1ecf2a4622bad2
parentce7676b8db6bac096dad4c4ad62e9e6bb8aa1064 (diff)
parentdcf64df89bc6d775e266ebd6b0134d135f47a35b (diff)
Creating branches/google/testing and tags/google/testing/2017-11-14 from r317716
git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/google/testing@318248 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--CMakeLists.txt45
-rw-r--r--cmake/config-ix.cmake2
-rw-r--r--cmake/modules/AddOCaml.cmake1
-rw-r--r--cmake/modules/CrossCompile.cmake5
-rw-r--r--cmake/modules/HandleLLVMOptions.cmake5
-rw-r--r--cmake/modules/TableGen.cmake15
-rw-r--r--docs/CMake.rst4
-rw-r--r--docs/CMakeLists.txt11
-rw-r--r--docs/CommandGuide/FileCheck.rst5
-rw-r--r--docs/CommandGuide/dsymutil.rst89
-rw-r--r--docs/CommandGuide/index.rst1
-rw-r--r--docs/CommandGuide/lli.rst12
-rw-r--r--docs/CommandGuide/llvm-pdbutil.rst8
-rw-r--r--docs/GetElementPtr.rst2
-rw-r--r--docs/HowToCrossCompileBuiltinsOnArm.rst201
-rw-r--r--docs/LangRef.rst281
-rw-r--r--docs/SourceLevelDebugging.rst2
-rw-r--r--docs/WritingAnLLVMPass.rst4
-rw-r--r--docs/index.rst4
-rw-r--r--include/llvm-c/DebugInfo.h148
-rw-r--r--include/llvm/ADT/MapVector.h7
-rw-r--r--include/llvm/ADT/STLExtras.h7
-rw-r--r--include/llvm/Analysis/BlockFrequencyInfo.h4
-rw-r--r--include/llvm/Analysis/BlockFrequencyInfoImpl.h49
-rw-r--r--include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h2
-rw-r--r--include/llvm/CodeGen/GlobalISel/LegalizerInfo.h377
-rw-r--r--include/llvm/CodeGen/MachineBasicBlock.h10
-rw-r--r--include/llvm/CodeGen/MachineBlockFrequencyInfo.h2
-rw-r--r--include/llvm/CodeGen/MachineInstr.h15
-rw-r--r--include/llvm/CodeGen/Passes.h6
-rw-r--r--include/llvm/CodeGen/ResourcePriorityQueue.h2
-rw-r--r--include/llvm/CodeGen/StackMaps.h2
-rw-r--r--include/llvm/CodeGen/TailDuplicator.h4
-rw-r--r--include/llvm/CodeGen/TargetFrameLowering.h (renamed from include/llvm/Target/TargetFrameLowering.h)14
-rw-r--r--include/llvm/CodeGen/TargetInstrInfo.h (renamed from include/llvm/Target/TargetInstrInfo.h)2
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFDebugLine.h8
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFFormValue.h14
-rw-r--r--include/llvm/IR/BasicBlock.h2
-rw-r--r--include/llvm/IR/DebugInfoMetadata.h26
-rw-r--r--include/llvm/IR/Instruction.h24
-rw-r--r--include/llvm/IR/IntrinsicsNVVM.td7
-rw-r--r--include/llvm/IR/LLVMContext.h1
-rw-r--r--include/llvm/IR/MDBuilder.h3
-rw-r--r--include/llvm/IR/ModuleSummaryIndex.h12
-rw-r--r--include/llvm/IR/ModuleSummaryIndexYAML.h8
-rw-r--r--include/llvm/IR/Operator.h113
-rw-r--r--include/llvm/IR/Value.h6
-rw-r--r--include/llvm/InitializePasses.h4
-rw-r--r--include/llvm/LinkAllPasses.h1
-rw-r--r--include/llvm/MC/MCFragment.h11
-rw-r--r--include/llvm/Object/ELF.h263
-rw-r--r--include/llvm/ObjectYAML/COFFYAML.h10
-rw-r--r--include/llvm/ProfileData/GCOV.h (renamed from include/llvm/Support/GCOV.h)4
-rw-r--r--include/llvm/ProfileData/SampleProfReader.h2
-rw-r--r--include/llvm/Support/CMakeLists.txt1
-rw-r--r--include/llvm/Support/FileOutputBuffer.h6
-rw-r--r--include/llvm/Support/LowLevelTypeImpl.h45
-rw-r--r--include/llvm/Support/MemoryBuffer.h3
-rw-r--r--include/llvm/Support/SpecialCaseList.h37
-rw-r--r--include/llvm/Support/TargetParser.h8
-rw-r--r--include/llvm/Target/Target.td18
-rw-r--r--include/llvm/Target/TargetLowering.h9
-rw-r--r--include/llvm/Transforms/PGOInstrumentation.h2
-rw-r--r--include/llvm/Transforms/Scalar.h10
-rw-r--r--include/llvm/Transforms/Scalar/CallSiteSplitting.h29
-rw-r--r--include/llvm/Transforms/Utils/LoopUtils.h6
-rw-r--r--lib/Analysis/BasicAliasAnalysis.cpp6
-rw-r--r--lib/Analysis/BlockFrequencyInfo.cpp5
-rw-r--r--lib/Analysis/BlockFrequencyInfoImpl.cpp21
-rw-r--r--lib/Analysis/LoopAccessAnalysis.cpp45
-rw-r--r--lib/Analysis/ModuleSummaryAnalysis.cpp9
-rw-r--r--lib/Analysis/TypeBasedAliasAnalysis.cpp178
-rw-r--r--lib/Analysis/ValueTracking.cpp13
-rw-r--r--lib/AsmParser/LLLexer.cpp2
-rw-r--r--lib/AsmParser/LLParser.h4
-rw-r--r--lib/AsmParser/LLToken.h2
-rw-r--r--lib/Bitcode/Reader/BitcodeReader.cpp10
-rw-r--r--lib/Bitcode/Writer/BitcodeWriter.cpp8
-rw-r--r--lib/Bitcode/Writer/ValueEnumerator.h2
-rw-r--r--lib/CodeGen/AggressiveAntiDepBreaker.cpp2
-rw-r--r--lib/CodeGen/Analysis.cpp2
-rw-r--r--lib/CodeGen/AsmPrinter/ARMException.cpp2
-rw-r--r--lib/CodeGen/AsmPrinter/AsmPrinter.cpp4
-rw-r--r--lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp2
-rw-r--r--lib/CodeGen/AsmPrinter/CodeViewDebug.cpp2
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfCFIException.cpp2
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp2
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfFile.h2
-rw-r--r--lib/CodeGen/AsmPrinter/WinException.cpp2
-rw-r--r--lib/CodeGen/BranchFolding.cpp63
-rw-r--r--lib/CodeGen/BranchRelaxation.cpp2
-rw-r--r--lib/CodeGen/CFIInstrInserter.cpp319
-rw-r--r--lib/CodeGen/CMakeLists.txt3
-rw-r--r--lib/CodeGen/CalcSpillWeights.cpp2
-rw-r--r--lib/CodeGen/CodeGen.cpp3
-rw-r--r--lib/CodeGen/CodeGenPrepare.cpp1154
-rw-r--r--lib/CodeGen/CriticalAntiDepBreaker.cpp2
-rw-r--r--lib/CodeGen/DFAPacketizer.cpp2
-rw-r--r--lib/CodeGen/DeadMachineInstructionElim.cpp2
-rw-r--r--lib/CodeGen/DetectDeadLanes.cpp2
-rw-r--r--lib/CodeGen/EarlyIfConversion.cpp2
-rw-r--r--lib/CodeGen/ExecutionDepsFix.cpp2
-rw-r--r--lib/CodeGen/ExpandMemCmp.cpp828
-rw-r--r--lib/CodeGen/ExpandPostRAPseudos.cpp2
-rw-r--r--lib/CodeGen/ExpandReductions.cpp2
-rw-r--r--lib/CodeGen/FEntryInserter.cpp4
-rw-r--r--lib/CodeGen/GCRootLowering.cpp4
-rw-r--r--lib/CodeGen/GlobalISel/IRTranslator.cpp2
-rw-r--r--lib/CodeGen/GlobalISel/Legalizer.cpp2
-rw-r--r--lib/CodeGen/GlobalISel/LegalizerHelper.cpp74
-rw-r--r--lib/CodeGen/GlobalISel/LegalizerInfo.cpp383
-rw-r--r--lib/CodeGen/GlobalISel/MachineIRBuilder.cpp2
-rw-r--r--lib/CodeGen/GlobalISel/RegisterBankInfo.cpp2
-rw-r--r--lib/CodeGen/GlobalISel/Utils.cpp2
-rw-r--r--lib/CodeGen/IfConversion.cpp2
-rw-r--r--lib/CodeGen/ImplicitNullChecks.cpp2
-rw-r--r--lib/CodeGen/InlineSpiller.cpp2
-rw-r--r--lib/CodeGen/LiveDebugValues.cpp4
-rw-r--r--lib/CodeGen/LiveDebugVariables.cpp2
-rw-r--r--lib/CodeGen/LiveRangeEdit.cpp2
-rw-r--r--lib/CodeGen/LiveVariables.cpp2
-rw-r--r--lib/CodeGen/LocalStackSlotAllocation.cpp2
-rw-r--r--lib/CodeGen/MIRCanonicalizerPass.cpp626
-rw-r--r--lib/CodeGen/MIRParser/MIParser.cpp6
-rw-r--r--lib/CodeGen/MIRPrinter.cpp33
-rw-r--r--lib/CodeGen/MachineBasicBlock.cpp10
-rw-r--r--lib/CodeGen/MachineBlockFrequencyInfo.cpp6
-rw-r--r--lib/CodeGen/MachineBlockPlacement.cpp2
-rw-r--r--lib/CodeGen/MachineCSE.cpp2
-rw-r--r--lib/CodeGen/MachineCombiner.cpp2
-rw-r--r--lib/CodeGen/MachineCopyPropagation.cpp2
-rw-r--r--lib/CodeGen/MachineFrameInfo.cpp4
-rw-r--r--lib/CodeGen/MachineFunction.cpp2
-rw-r--r--lib/CodeGen/MachineInstr.cpp52
-rw-r--r--lib/CodeGen/MachineInstrBundle.cpp2
-rw-r--r--lib/CodeGen/MachineLICM.cpp2
-rw-r--r--lib/CodeGen/MachineOutliner.cpp2
-rw-r--r--lib/CodeGen/MachinePipeliner.cpp2
-rw-r--r--lib/CodeGen/MachineRegisterInfo.cpp2
-rw-r--r--lib/CodeGen/MachineSSAUpdater.cpp2
-rw-r--r--lib/CodeGen/MachineScheduler.cpp2
-rw-r--r--lib/CodeGen/MachineSink.cpp2
-rw-r--r--lib/CodeGen/MachineVerifier.cpp2
-rw-r--r--lib/CodeGen/MacroFusion.cpp2
-rw-r--r--lib/CodeGen/OptimizePHIs.cpp2
-rw-r--r--lib/CodeGen/PHIElimination.cpp2
-rw-r--r--lib/CodeGen/PatchableFunction.cpp4
-rw-r--r--lib/CodeGen/PeepholeOptimizer.cpp2
-rw-r--r--lib/CodeGen/PostRAHazardRecognizer.cpp2
-rw-r--r--lib/CodeGen/PostRASchedulerList.cpp2
-rw-r--r--lib/CodeGen/ProcessImplicitDefs.cpp2
-rw-r--r--lib/CodeGen/PrologEpilogInserter.cpp20
-rw-r--r--lib/CodeGen/PseudoSourceValue.cpp2
-rw-r--r--lib/CodeGen/RegAllocFast.cpp2
-rw-r--r--lib/CodeGen/RegAllocGreedy.cpp2
-rw-r--r--lib/CodeGen/RegUsageInfoCollector.cpp2
-rw-r--r--lib/CodeGen/RegisterClassInfo.cpp2
-rw-r--r--lib/CodeGen/RegisterCoalescer.cpp4
-rw-r--r--lib/CodeGen/RegisterScavenging.cpp4
-rw-r--r--lib/CodeGen/RenameIndependentSubregs.cpp2
-rw-r--r--lib/CodeGen/ScheduleDAG.cpp2
-rw-r--r--lib/CodeGen/ScoreboardHazardRecognizer.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/FastISel.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp4
-rw-r--r--lib/CodeGen/SelectionDAG/InstrEmitter.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeDAG.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypes.cpp13
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAG.cpp6
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp28
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp4
-rw-r--r--lib/CodeGen/SelectionDAG/TargetLowering.cpp13
-rw-r--r--lib/CodeGen/ShrinkWrap.cpp6
-rw-r--r--lib/CodeGen/SlotIndexes.cpp2
-rw-r--r--lib/CodeGen/SplitKit.cpp2
-rw-r--r--lib/CodeGen/StackSlotColoring.cpp2
-rw-r--r--lib/CodeGen/TailDuplicator.cpp10
-rw-r--r--lib/CodeGen/TargetFrameLoweringImpl.cpp11
-rw-r--r--lib/CodeGen/TargetInstrInfo.cpp4
-rw-r--r--lib/CodeGen/TargetOptionsImpl.cpp2
-rw-r--r--lib/CodeGen/TargetPassConfig.cpp10
-rw-r--r--lib/CodeGen/TargetRegisterInfo.cpp2
-rw-r--r--lib/CodeGen/TargetSchedule.cpp2
-rw-r--r--lib/CodeGen/TargetSubtargetInfo.cpp2
-rw-r--r--lib/CodeGen/TwoAddressInstructionPass.cpp2
-rw-r--r--lib/CodeGen/UnreachableBlockElim.cpp2
-rw-r--r--lib/CodeGen/VirtRegMap.cpp2
-rw-r--r--lib/CodeGen/XRayInstrumentation.cpp2
-rw-r--r--lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp2
-rw-r--r--lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp10
-rw-r--r--lib/DebugInfo/DWARF/DWARFContext.cpp8
-rw-r--r--lib/DebugInfo/DWARF/DWARFDebugLine.cpp20
-rw-r--r--lib/DebugInfo/DWARF/DWARFDie.cpp5
-rw-r--r--lib/DebugInfo/DWARF/DWARFFormValue.cpp13
-rw-r--r--lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp5
-rw-r--r--lib/IR/AsmWriter.cpp8
-rw-r--r--lib/IR/AutoUpgrade.cpp11
-rw-r--r--lib/IR/BasicBlock.cpp13
-rw-r--r--lib/IR/CMakeLists.txt1
-rw-r--r--lib/IR/DebugInfo.cpp20
-rw-r--r--lib/IR/DebugInfoMetadata.cpp47
-rw-r--r--lib/IR/Instruction.cpp30
-rw-r--r--lib/IR/LLVMContext.cpp1
-rw-r--r--lib/IR/MDBuilder.cpp7
-rw-r--r--lib/IR/Value.cpp22
-rw-r--r--lib/IR/Verifier.cpp2
-rw-r--r--lib/LTO/LTO.cpp21
-rw-r--r--lib/LTO/LTOCodeGenerator.cpp20
-rw-r--r--lib/MC/MCAssembler.cpp6
-rw-r--r--lib/MC/MCFragment.cpp10
-rw-r--r--lib/MC/MCWinCOFFStreamer.cpp2
-rw-r--r--lib/Object/ArchiveWriter.cpp64
-rw-r--r--lib/Object/ELF.cpp263
-rw-r--r--lib/ObjectYAML/COFFYAML.cpp48
-rw-r--r--lib/Passes/PassBuilder.cpp9
-rw-r--r--lib/Passes/PassRegistry.def1
-rw-r--r--lib/ProfileData/CMakeLists.txt1
-rw-r--r--lib/ProfileData/GCOV.cpp (renamed from lib/IR/GCOV.cpp)2
-rw-r--r--lib/Support/Chrono.cpp6
-rw-r--r--lib/Support/FileOutputBuffer.cpp28
-rw-r--r--lib/Support/Host.cpp93
-rw-r--r--lib/Support/LowLevelType.cpp2
-rw-r--r--lib/Support/SpecialCaseList.cpp83
-rw-r--r--lib/Support/Unix/Path.inc2
-rw-r--r--lib/Target/AArch64/AArch64A53Fix835769.cpp2
-rw-r--r--lib/Target/AArch64/AArch64CallingConvention.h2
-rw-r--r--lib/Target/AArch64/AArch64CondBrTuning.cpp2
-rw-r--r--lib/Target/AArch64/AArch64ConditionOptimizer.cpp2
-rw-r--r--lib/Target/AArch64/AArch64ConditionalCompares.cpp2
-rw-r--r--lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp2
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.cpp2
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.h2
-rw-r--r--lib/Target/AArch64/AArch64GenRegisterBankInfo.def106
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp4
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.h2
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.td2
-rw-r--r--lib/Target/AArch64/AArch64LegalizerInfo.cpp169
-rw-r--r--lib/Target/AArch64/AArch64MacroFusion.cpp2
-rw-r--r--lib/Target/AArch64/AArch64RegisterBankInfo.cpp40
-rw-r--r--lib/Target/AArch64/AArch64RegisterBankInfo.h27
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.cpp2
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.td131
-rw-r--r--lib/Target/AArch64/AArch64SVEInstrInfo.td17
-rw-r--r--lib/Target/AArch64/AArch64SchedA53.td2
-rw-r--r--lib/Target/AArch64/AArch64SchedA57.td2
-rw-r--r--lib/Target/AArch64/AArch64SchedCyclone.td2
-rw-r--r--lib/Target/AArch64/AArch64SchedFalkor.td2
-rw-r--r--lib/Target/AArch64/AArch64SchedKryo.td2
-rw-r--r--lib/Target/AArch64/AArch64SchedM1.td2
-rw-r--r--lib/Target/AArch64/AArch64SchedThunderX.td2
-rw-r--r--lib/Target/AArch64/AArch64SchedThunderX2T99.td2
-rw-r--r--lib/Target/AArch64/AArch64StorePairSuppress.cpp2
-rw-r--r--lib/Target/AArch64/AArch64VectorByElementOpt.cpp2
-rw-r--r--lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp266
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp24
-rw-r--r--lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp20
-rw-r--r--lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h3
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp4
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp8
-rw-r--r--lib/Target/AArch64/SVEInstrFormats.td41
-rw-r--r--lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp2
-rw-r--r--lib/Target/AMDGPU/AMDGPUFrameLowering.h2
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp19
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.cpp57
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.h5
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.h2
-rw-r--r--lib/Target/AMDGPU/AMDGPULibCalls.cpp6
-rw-r--r--lib/Target/AMDGPU/AMDGPULibFunc.cpp29
-rw-r--r--lib/Target/AMDGPU/AMDGPULibFunc.h20
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.cpp2
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.h4
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.cpp4
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp62
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.h8
-rw-r--r--lib/Target/AMDGPU/VOP3Instructions.td2
-rw-r--r--lib/Target/ARC/ARCBranchFinalize.cpp2
-rw-r--r--lib/Target/ARC/ARCFrameLowering.h2
-rw-r--r--lib/Target/ARC/ARCInstrInfo.h2
-rw-r--r--lib/Target/ARC/ARCRegisterInfo.cpp2
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.cpp2
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.h2
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.cpp2
-rw-r--r--lib/Target/ARM/ARMCallLowering.cpp12
-rw-r--r--lib/Target/ARM/ARMCallingConv.h2
-rw-r--r--lib/Target/ARM/ARMFastISel.cpp2
-rw-r--r--lib/Target/ARM/ARMFrameLowering.cpp2
-rw-r--r--lib/Target/ARM/ARMFrameLowering.h2
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp2
-rw-r--r--lib/Target/ARM/ARMLegalizerInfo.cpp71
-rw-r--r--lib/Target/ARM/ARMLoadStoreOptimizer.cpp6
-rw-r--r--lib/Target/ARM/ARMMacroFusion.cpp2
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.cpp2
-rw-r--r--lib/Target/ARM/Thumb2SizeReduction.cpp2
-rw-r--r--lib/Target/ARM/ThumbRegisterInfo.cpp2
-rw-r--r--lib/Target/AVR/AVRFrameLowering.h2
-rw-r--r--lib/Target/AVR/AVRInstrInfo.h2
-rw-r--r--lib/Target/AVR/AVRRegisterInfo.cpp2
-rw-r--r--lib/Target/BPF/BPFFrameLowering.h2
-rw-r--r--lib/Target/BPF/BPFInstrInfo.h2
-rw-r--r--lib/Target/BPF/BPFRegisterInfo.cpp4
-rw-r--r--lib/Target/Hexagon/HexagonCFGOptimizer.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonFixupHwLoops.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonFrameLowering.h2
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.h2
-rw-r--r--lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp69
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.h2
-rw-r--r--lib/Target/Hexagon/HexagonPatterns.td134
-rw-r--r--lib/Target/Hexagon/HexagonPeephole.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp15
-rw-r--r--lib/Target/Hexagon/RDFGraph.cpp2
-rw-r--r--lib/Target/Lanai/LanaiDelaySlotFiller.cpp2
-rw-r--r--lib/Target/Lanai/LanaiFrameLowering.h2
-rw-r--r--lib/Target/Lanai/LanaiInstrInfo.h2
-rw-r--r--lib/Target/Lanai/LanaiMemAluCombiner.cpp2
-rw-r--r--lib/Target/Lanai/LanaiRegisterInfo.cpp4
-rw-r--r--lib/Target/Lanai/LanaiSubtarget.h2
-rw-r--r--lib/Target/Lanai/LanaiTargetMachine.h2
-rw-r--r--lib/Target/MSP430/MSP430FrameLowering.h2
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.h2
-rw-r--r--lib/Target/MSP430/MSP430TargetMachine.h2
-rw-r--r--lib/Target/Mips/Disassembler/MipsDisassembler.cpp6
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp23
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h3
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp8
-rw-r--r--lib/Target/Mips/MicroMips32r6InstrFormats.td15
-rw-r--r--lib/Target/Mips/MicroMips32r6InstrInfo.td11
-rw-r--r--lib/Target/Mips/MicroMips64r6InstrInfo.td7
-rw-r--r--lib/Target/Mips/MicroMipsInstrInfo.td15
-rw-r--r--lib/Target/Mips/Mips16FrameLowering.cpp2
-rw-r--r--lib/Target/Mips/Mips16ISelLowering.cpp2
-rw-r--r--lib/Target/Mips/Mips16RegisterInfo.cpp4
-rw-r--r--lib/Target/Mips/Mips64InstrInfo.td6
-rw-r--r--lib/Target/Mips/MipsFastISel.cpp2
-rw-r--r--lib/Target/Mips/MipsFrameLowering.cpp35
-rw-r--r--lib/Target/Mips/MipsFrameLowering.h2
-rw-r--r--lib/Target/Mips/MipsISelLowering.cpp6
-rw-r--r--lib/Target/Mips/MipsInstrInfo.h2
-rw-r--r--lib/Target/Mips/MipsInstrInfo.td26
-rw-r--r--lib/Target/Mips/MipsOptimizePICCall.cpp2
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.cpp2
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.td1
-rw-r--r--lib/Target/Mips/MipsSEFrameLowering.cpp10
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.cpp58
-rw-r--r--lib/Target/Mips/MipsSEISelLowering.cpp2
-rw-r--r--lib/Target/Mips/MipsSERegisterInfo.cpp4
-rw-r--r--lib/Target/Mips/MipsScheduleGeneric.td1
-rw-r--r--lib/Target/NVPTX/NVPTXFrameLowering.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXFrameLowering.h2
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.cpp1
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.h2
-rw-r--r--lib/Target/NVPTX/NVPTXIntrinsics.td13
-rw-r--r--lib/Target/NVPTX/NVPTXPeephole.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXRegisterInfo.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.h2
-rw-r--r--lib/Target/Nios2/Nios2FrameLowering.h2
-rw-r--r--lib/Target/Nios2/Nios2InstrInfo.h2
-rw-r--r--lib/Target/PowerPC/PPCBranchCoalescing.cpp4
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.h2
-rw-r--r--lib/Target/PowerPC/PPCISelDAGToDAG.cpp2
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp164
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.h6
-rw-r--r--lib/Target/PowerPC/PPCInstrAltivec.td7
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.h2
-rw-r--r--lib/Target/PowerPC/PPCInstrVSX.td7
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.cpp4
-rw-r--r--lib/Target/RISCV/RISCV.h12
-rw-r--r--lib/Target/RISCV/RISCV.td4
-rw-r--r--lib/Target/RISCV/RISCVAsmPrinter.cpp7
-rw-r--r--lib/Target/RISCV/RISCVCallingConv.td3
-rw-r--r--lib/Target/RISCV/RISCVFrameLowering.h8
-rw-r--r--lib/Target/RISCV/RISCVISelLowering.cpp161
-rw-r--r--lib/Target/RISCV/RISCVISelLowering.h6
-rw-r--r--lib/Target/RISCV/RISCVInstrInfo.cpp48
-rw-r--r--lib/Target/RISCV/RISCVInstrInfo.h18
-rw-r--r--lib/Target/RISCV/RISCVInstrInfo.td120
-rw-r--r--lib/Target/RISCV/RISCVMCInstLower.cpp83
-rw-r--r--lib/Target/RISCV/RISCVRegisterInfo.cpp41
-rw-r--r--lib/Target/RISCV/RISCVRegisterInfo.h5
-rw-r--r--lib/Target/Sparc/DelaySlotFiller.cpp2
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.h2
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.h2
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.cpp2
-rw-r--r--lib/Target/Sparc/SparcSubtarget.h2
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.h2
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.cpp2
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.h2
-rw-r--r--lib/Target/SystemZ/SystemZLDCleanup.cpp2
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.cpp2
-rw-r--r--lib/Target/SystemZ/SystemZTargetTransformInfo.cpp5
-rw-r--r--lib/Target/SystemZ/SystemZTargetTransformInfo.h1
-rw-r--r--lib/Target/TargetMachine.cpp7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp11
-rw-r--r--lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp20
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFastISel.cpp2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFrameLowering.h2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.h2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp2
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParser.cpp4
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp6
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp6
-rw-r--r--lib/Target/X86/X86.td14
-rw-r--r--lib/Target/X86/X86CallFrameOptimization.cpp2
-rw-r--r--lib/Target/X86/X86CallLowering.cpp2
-rw-r--r--lib/Target/X86/X86CmovConversion.cpp2
-rw-r--r--lib/Target/X86/X86EvexToVex.cpp20
-rw-r--r--lib/Target/X86/X86FixupBWInsts.cpp2
-rw-r--r--lib/Target/X86/X86FixupLEAs.cpp2
-rw-r--r--lib/Target/X86/X86FloatingPoint.cpp2
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp48
-rw-r--r--lib/Target/X86/X86FrameLowering.h6
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp12
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp305
-rw-r--r--lib/Target/X86/X86ISelLowering.h12
-rw-r--r--lib/Target/X86/X86InstrAVX512.td331
-rw-r--r--lib/Target/X86/X86InstrFMA.td80
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td22
-rw-r--r--lib/Target/X86/X86InstrInfo.h2
-rw-r--r--lib/Target/X86/X86InstrInfo.td1
-rw-r--r--lib/Target/X86/X86InstrSSE.td104
-rw-r--r--lib/Target/X86/X86IntrinsicsInfo.h78
-rw-r--r--lib/Target/X86/X86LegalizerInfo.cpp66
-rw-r--r--lib/Target/X86/X86MacroFusion.cpp2
-rw-r--r--lib/Target/X86/X86PadShortFunction.cpp2
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp4
-rw-r--r--lib/Target/X86/X86Subtarget.cpp2
-rw-r--r--lib/Target/X86/X86Subtarget.h2
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp7
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp7
-rw-r--r--lib/Target/X86/X86VZeroUpper.cpp2
-rw-r--r--lib/Target/X86/X86WinAllocaExpander.cpp2
-rw-r--r--lib/Target/XCore/XCoreFrameLowering.h2
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.h2
-rw-r--r--lib/Target/XCore/XCoreRegisterInfo.cpp2
-rw-r--r--lib/Transforms/IPO/GlobalOpt.cpp10
-rw-r--r--lib/Transforms/IPO/LowerTypeTests.cpp2
-rw-r--r--lib/Transforms/IPO/PartialInlining.cpp16
-rw-r--r--lib/Transforms/IPO/PassManagerBuilder.cpp9
-rw-r--r--lib/Transforms/IPO/SampleProfile.cpp26
-rw-r--r--lib/Transforms/InstCombine/InstCombineAddSub.cpp8
-rw-r--r--lib/Transforms/InstCombine/InstCombineCalls.cpp2
-rw-r--r--lib/Transforms/InstCombine/InstCombineCompares.cpp3
-rw-r--r--lib/Transforms/InstCombine/InstCombineMulDivRem.cpp11
-rw-r--r--lib/Transforms/InstCombine/InstCombineShifts.cpp109
-rw-r--r--lib/Transforms/Instrumentation/PGOInstrumentation.cpp28
-rw-r--r--lib/Transforms/Scalar/ADCE.cpp4
-rw-r--r--lib/Transforms/Scalar/CMakeLists.txt1
-rw-r--r--lib/Transforms/Scalar/CallSiteSplitting.cpp492
-rw-r--r--lib/Transforms/Scalar/IndVarSimplify.cpp10
-rw-r--r--lib/Transforms/Scalar/JumpThreading.cpp7
-rw-r--r--lib/Transforms/Scalar/LICM.cpp196
-rw-r--r--lib/Transforms/Scalar/LoopIdiomRecognize.cpp6
-rw-r--r--lib/Transforms/Scalar/LoopPredication.cpp234
-rw-r--r--lib/Transforms/Scalar/LoopStrengthReduce.cpp2
-rw-r--r--lib/Transforms/Scalar/Reassociate.cpp12
-rw-r--r--lib/Transforms/Scalar/RewriteStatepointsForGC.cpp42
-rw-r--r--lib/Transforms/Scalar/SROA.cpp6
-rw-r--r--lib/Transforms/Scalar/Scalar.cpp1
-rw-r--r--lib/Transforms/Utils/FunctionImportUtils.cpp17
-rw-r--r--lib/Transforms/Utils/Local.cpp37
-rw-r--r--lib/Transforms/Utils/LoopUtils.cpp12
-rw-r--r--lib/Transforms/Utils/SimplifyCFG.cpp4
-rw-r--r--lib/Transforms/Utils/SimplifyLibCalls.cpp37
-rw-r--r--lib/Transforms/Utils/SplitModule.cpp18
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp6
-rw-r--r--lib/Transforms/Vectorize/SLPVectorizer.cpp4
-rw-r--r--test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll208
-rw-r--r--test/Analysis/CostModel/X86/interleaved-load-float.ll141
-rw-r--r--test/Assembler/fast-math-flags.ll32
-rw-r--r--test/Bitcode/compatibility-3.6.ll4
-rw-r--r--test/Bitcode/compatibility-3.7.ll4
-rw-r--r--test/Bitcode/compatibility-3.8.ll8
-rw-r--r--test/Bitcode/compatibility-3.9.ll8
-rw-r--r--test/Bitcode/compatibility-4.0.ll8
-rw-r--r--test/Bitcode/compatibility-5.0.ll8
-rw-r--r--test/Bitcode/compatibility.ll4
-rw-r--r--test/Bitcode/thinlto-summary-local-5.0.ll22
-rw-r--r--test/Bitcode/thinlto-summary-local-5.0.ll.bc  bin  0 -> 1028 bytes
-rw-r--r--test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll67
-rw-r--r--test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir104
-rw-r--r--test/CodeGen/AArch64/GlobalISel/legalize-add.mir91
-rw-r--r--test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir19
-rw-r--r--test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir4
-rw-r--r--test/CodeGen/AArch64/GlobalISel/select-int-ext.mir6
-rw-r--r--test/CodeGen/AArch64/dwarf-cfi.ll36
-rw-r--r--test/CodeGen/AArch64/recp-fastmath.ll34
-rw-r--r--test/CodeGen/AArch64/sqrt-fastmath.ll83
-rw-r--r--test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir12
-rw-r--r--test/CodeGen/AMDGPU/detect-dead-lanes.mir44
-rw-r--r--test/CodeGen/AMDGPU/mad_64_32.ll168
-rw-r--r--test/CodeGen/AMDGPU/mul.ll125
-rw-r--r--test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir32
-rw-r--r--test/CodeGen/AMDGPU/private-memory-r600.ll249
-rw-r--r--test/CodeGen/AMDGPU/simplify-libcalls.ll122
-rw-r--r--test/CodeGen/AMDGPU/unknown-processor.ll9
-rw-r--r--test/CodeGen/AMDGPU/unsupported-calls.ll4
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir35
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir16
-rw-r--r--test/CodeGen/Generic/llc-start-stop.ll6
-rw-r--r--test/CodeGen/Hexagon/isel-prefer.ll10
-rw-r--r--test/CodeGen/MIR/X86/subregister-index-operands.mir6
-rw-r--r--test/CodeGen/Mips/brind-tailcall.ll60
-rw-r--r--test/CodeGen/Mips/dins.ll14
-rw-r--r--test/CodeGen/Mips/msa/emergency-spill.mir221
-rw-r--r--test/CodeGen/Mips/msa/frameindex.ll49
-rw-r--r--test/CodeGen/Mips/tailcall/tailcall.ll15
-rw-r--r--test/CodeGen/NVPTX/atomics-sm60.ll19
-rw-r--r--test/CodeGen/NVPTX/generic-to-nvvm-ir.ll2
-rw-r--r--test/CodeGen/PowerPC/bswap64.ll13
-rw-r--r--test/CodeGen/PowerPC/p9-vinsert-vextract.ll822
-rw-r--r--test/CodeGen/PowerPC/subreg-postra-2.ll8
-rw-r--r--test/CodeGen/RISCV/alu32.ll1
-rw-r--r--test/CodeGen/RISCV/branch.ll121
-rw-r--r--test/CodeGen/RISCV/calls.ll83
-rw-r--r--test/CodeGen/RISCV/imm.ll47
-rw-r--r--test/CodeGen/RISCV/mem.ll202
-rw-r--r--test/CodeGen/RISCV/wide-mem.ll34
-rw-r--r--test/CodeGen/WebAssembly/inline-asm-m.ll13
-rw-r--r--test/CodeGen/WebAssembly/inline-asm.ll56
-rw-r--r--test/CodeGen/WebAssembly/signext-arg.ll22
-rw-r--r--test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll1
-rw-r--r--test/CodeGen/X86/2011-10-19-widen_vselect.ll1
-rw-r--r--test/CodeGen/X86/GlobalISel/add-scalar.ll1
-rw-r--r--test/CodeGen/X86/GlobalISel/brcond.ll1
-rw-r--r--test/CodeGen/X86/GlobalISel/callingconv.ll21
-rw-r--r--test/CodeGen/X86/GlobalISel/frameIndex.ll1
-rw-r--r--test/CodeGen/X86/GlobalISel/select-cmp.mir26
-rw-r--r--test/CodeGen/X86/GlobalISel/select-copy.mir6
-rw-r--r--test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir10
-rw-r--r--test/CodeGen/X86/GlobalISel/select-ext.mir12
-rw-r--r--test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir2
-rw-r--r--test/CodeGen/X86/O0-pipeline.ll1
-rw-r--r--test/CodeGen/X86/TruncAssertZext.ll1
-rw-r--r--test/CodeGen/X86/avg.ll185
-rw-r--r--test/CodeGen/X86/avx-basic.ll7
-rw-r--r--test/CodeGen/X86/avx-intrinsics-x86.ll52
-rw-r--r--test/CodeGen/X86/avx-schedule.ll8
-rw-r--r--test/CodeGen/X86/avx512-mask-op.ll10
-rw-r--r--test/CodeGen/X86/avx512-regcall-Mask.ll22
-rw-r--r--test/CodeGen/X86/avx512-regcall-NoMask.ll17
-rwxr-xr-xtest/CodeGen/X86/avx512-schedule.ll4
-rw-r--r--test/CodeGen/X86/avx512-select.ll1
-rwxr-xr-xtest/CodeGen/X86/avx512-shuffle-schedule.ll736
-rw-r--r--test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll368
-rw-r--r--test/CodeGen/X86/avx512-skx-insert-subvec.ll2
-rw-r--r--test/CodeGen/X86/avx512-vbroadcast.ll2
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll2
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll12
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics.ll4
-rw-r--r--test/CodeGen/X86/avx512bw-vec-test-testn.ll32
-rw-r--r--test/CodeGen/X86/avx512bwvl-vec-test-testn.ll64
-rw-r--r--test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll37
-rw-r--r--test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll23
-rw-r--r--test/CodeGen/X86/avx512cd-intrinsics.ll22
-rw-r--r--test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll44
-rw-r--r--test/CodeGen/X86/avx512cdvl-intrinsics.ll43
-rw-r--r--test/CodeGen/X86/avx512f-vec-test-testn.ll32
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll30
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll4
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics.ll12
-rw-r--r--test/CodeGen/X86/avx512vl-vbroadcast.ll3
-rw-r--r--test/CodeGen/X86/avx512vl-vec-masked-cmp.ll520
-rw-r--r--test/CodeGen/X86/avx512vl-vec-test-testn.ll128
-rw-r--r--test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll75
-rw-r--r--test/CodeGen/X86/bitcast-and-setcc-256.ll1
-rw-r--r--test/CodeGen/X86/bitcast-and-setcc-512.ll4
-rw-r--r--test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll6
-rw-r--r--test/CodeGen/X86/bitcast-setcc-256.ll1
-rw-r--r--test/CodeGen/X86/bitcast-setcc-512.ll4
-rw-r--r--test/CodeGen/X86/bool-vector.ll3
-rw-r--r--test/CodeGen/X86/broadcastm-lowering.ll7
-rw-r--r--test/CodeGen/X86/cmp.ll3
-rw-r--r--test/CodeGen/X86/combine-srl.ll2
-rw-r--r--test/CodeGen/X86/compress_expand.ll4
-rw-r--r--test/CodeGen/X86/emutls-pie.ll6
-rw-r--r--test/CodeGen/X86/emutls.ll16
-rw-r--r--test/CodeGen/X86/epilogue-cfi-fp.ll43
-rw-r--r--test/CodeGen/X86/epilogue-cfi-no-fp.ll46
-rw-r--r--test/CodeGen/X86/f16c-intrinsics.ll271
-rw-r--r--test/CodeGen/X86/fast-isel-int-float-conversion.ll12
-rw-r--r--test/CodeGen/X86/fast-isel-store.ll10
-rw-r--r--test/CodeGen/X86/fma-intrinsics-x86.ll874
-rw-r--r--test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll18
-rw-r--r--test/CodeGen/X86/frame-lowering-debug-intrinsic.ll4
-rw-r--r--test/CodeGen/X86/haddsub-2.ll12
-rw-r--r--test/CodeGen/X86/hipe-cc64.ll1
-rw-r--r--test/CodeGen/X86/horizontal-reduce-smax.ll1896
-rw-r--r--test/CodeGen/X86/horizontal-reduce-smin.ll1898
-rw-r--r--test/CodeGen/X86/horizontal-reduce-umax.ll2203
-rw-r--r--test/CodeGen/X86/horizontal-reduce-umin.ll2207
-rw-r--r--test/CodeGen/X86/illegal-bitfield-loadstore.ll1
-rw-r--r--test/CodeGen/X86/imul.ll3
-rw-r--r--test/CodeGen/X86/inline-asm-A-constraint.ll3
-rw-r--r--test/CodeGen/X86/lea-opt-cse1.ll1
-rw-r--r--test/CodeGen/X86/lea-opt-cse2.ll2
-rw-r--r--test/CodeGen/X86/lea-opt-cse3.ll2
-rw-r--r--test/CodeGen/X86/lea-opt-cse4.ll3
-rw-r--r--test/CodeGen/X86/legalize-shift-64.ll5
-rw-r--r--test/CodeGen/X86/live-out-reg-info.ll1
-rw-r--r--test/CodeGen/X86/load-combine.ll2
-rw-r--r--test/CodeGen/X86/masked_gather_scatter.ll34
-rw-r--r--test/CodeGen/X86/masked_memop.ll12
-rw-r--r--test/CodeGen/X86/memcmp-optsize.ll224
-rw-r--r--test/CodeGen/X86/memcmp.ll240
-rw-r--r--test/CodeGen/X86/memset-nonzero.ll1
-rw-r--r--test/CodeGen/X86/merge-consecutive-loads-128.ll19
-rw-r--r--test/CodeGen/X86/movtopush.ll4
-rw-r--r--test/CodeGen/X86/mul-constant-result.ll59
-rw-r--r--test/CodeGen/X86/mul-i256.ll8
-rw-r--r--test/CodeGen/X86/mul128.ll5
-rw-r--r--test/CodeGen/X86/no-plt.ll30
-rw-r--r--test/CodeGen/X86/pop-stack-cleanup-msvc.ll26
-rw-r--r--test/CodeGen/X86/pr21792.ll1
-rw-r--r--test/CodeGen/X86/pr29061.ll2
-rw-r--r--test/CodeGen/X86/pr29112.ll1
-rw-r--r--test/CodeGen/X86/pr30430.ll1
-rw-r--r--test/CodeGen/X86/pr32241.ll2
-rw-r--r--test/CodeGen/X86/pr32256.ll1
-rw-r--r--test/CodeGen/X86/pr32282.ll1
-rw-r--r--test/CodeGen/X86/pr32284.ll16
-rw-r--r--test/CodeGen/X86/pr32329.ll4
-rw-r--r--test/CodeGen/X86/pr32345.ll2
-rw-r--r--test/CodeGen/X86/pr32451.ll2
-rw-r--r--test/CodeGen/X86/pr34088.ll1
-rw-r--r--test/CodeGen/X86/pr34653.ll210
-rw-r--r--test/CodeGen/X86/pr34657.ll20
-rw-r--r--test/CodeGen/X86/pr9743.ll1
-rw-r--r--test/CodeGen/X86/push-cfi-debug.ll4
-rw-r--r--test/CodeGen/X86/push-cfi-obj.ll7
-rw-r--r--test/CodeGen/X86/push-cfi.ll3
-rw-r--r--test/CodeGen/X86/recip-fastmath.ll16
-rw-r--r--test/CodeGen/X86/recip-fastmath2.ll32
-rw-r--r--test/CodeGen/X86/return-ext.ll3
-rw-r--r--test/CodeGen/X86/rtm.ll1
-rw-r--r--test/CodeGen/X86/schedule-x86_32.ll348
-rw-r--r--test/CodeGen/X86/schedule-x86_64.ll737
-rw-r--r--test/CodeGen/X86/select-mmx.ll2
-rw-r--r--test/CodeGen/X86/select.ll38
-rw-r--r--test/CodeGen/X86/setcc-lowering.ll8
-rw-r--r--test/CodeGen/X86/shrink_vmul.ll13
-rw-r--r--test/CodeGen/X86/sse-intrinsics-x86.ll52
-rw-r--r--test/CodeGen/X86/sse-schedule.ll8
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-x86.ll28
-rw-r--r--test/CodeGen/X86/statepoint-call-lowering.ll1
-rw-r--r--test/CodeGen/X86/statepoint-gctransition-call-lowering.ll1
-rw-r--r--test/CodeGen/X86/statepoint-invoke.ll3
-rw-r--r--test/CodeGen/X86/throws-cfi-fp.ll98
-rw-r--r--test/CodeGen/X86/throws-cfi-no-fp.ll97
-rw-r--r--test/CodeGen/X86/var-permute-128.ll199
-rw-r--r--test/CodeGen/X86/var-permute-256.ll1020
-rw-r--r--test/CodeGen/X86/var-permute-512.ll618
-rw-r--r--test/CodeGen/X86/vec_fp_to_int.ll74
-rw-r--r--test/CodeGen/X86/vector-half-conversions.ll2620
-rw-r--r--test/CodeGen/X86/vector-sext.ll13
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v16.ll18
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v4.ll8
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v8.ll45
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v8.ll21
-rw-r--r--test/CodeGen/X86/vector-shuffle-avx512.ll2
-rw-r--r--test/CodeGen/X86/vector-shuffle-v1.ll2
-rw-r--r--test/CodeGen/X86/vector-trunc.ll73
-rw-r--r--test/CodeGen/X86/wide-integer-cmp.ll3
-rw-r--r--test/CodeGen/X86/x86-framelowering-trap.ll1
-rw-r--r--test/CodeGen/X86/x86-interleaved-access.ll1
-rw-r--r--test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll1
-rw-r--r--test/DebugInfo/AArch64/inlined-argument.ll140
-rw-r--r--test/DebugInfo/ARM/illegal-fragment.ll95
-rw-r--r--test/DebugInfo/ARM/salvage-debug-info.ll118
-rw-r--r--test/DebugInfo/Generic/location-verifier.ll2
-rw-r--r--test/DebugInfo/Generic/missing-abstract-variable.ll5
-rw-r--r--test/DebugInfo/Inputs/dwarfdump-header.elf-x86-64  bin  3056 -> 0 bytes
-rw-r--r--test/DebugInfo/X86/dwarfdump-header-64.s149
-rw-r--r--test/DebugInfo/X86/dwarfdump-header.s (renamed from test/DebugInfo/Inputs/dwarfdump-header.s)56
-rw-r--r--test/DebugInfo/X86/live-debug-variables.ll5
-rw-r--r--test/DebugInfo/dwarfdump-header.test60
-rw-r--r--test/FileCheck/defines.txt9
-rw-r--r--test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll6
-rw-r--r--test/LTO/Resolution/X86/comdat-mixed-lto.ll2
-rw-r--r--test/LTO/Resolution/X86/comdat.ll4
-rw-r--r--test/LTO/Resolution/X86/commons.ll2
-rw-r--r--test/MC/AArch64/SVE/assembler_tests/add.s66
-rw-r--r--test/MC/AArch64/SVE/assembler_tests/sub.s66
-rw-r--r--test/MC/AArch64/SVE/disassembler_tests/add.s50
-rw-r--r--test/MC/AArch64/SVE/disassembler_tests/sub.s50
-rw-r--r--test/MC/Disassembler/Mips/micromips32r3/valid-el.txt1
-rw-r--r--test/MC/Disassembler/Mips/micromips32r3/valid.txt1
-rw-r--r--test/MC/Disassembler/Mips/micromips32r6/valid.txt2
-rw-r--r--test/MC/Disassembler/Mips/micromips64r6/valid.txt2
-rw-r--r--test/MC/Disassembler/X86/prefixes-i386.txt78
-rw-r--r--test/MC/Disassembler/X86/prefixes-x86_64.txt24
-rw-r--r--test/MC/Disassembler/X86/prefixes.txt66
-rw-r--r--test/MC/Disassembler/X86/simple-tests.txt9
-rw-r--r--test/MC/Mips/micromips32r6/valid.s2
-rw-r--r--test/MC/Mips/micromips64r6/valid.s2
-rw-r--r--test/MC/Mips/tls-symbols.s28
-rw-r--r--test/Object/Inputs/trivial-object-test.coff-arm64  bin  0 -> 318 bytes
-rw-r--r--test/Object/Inputs/trivial-object-test.coff-armnt  bin  0 -> 314 bytes
-rw-r--r--test/Object/archive-SYM64-write.test38
-rw-r--r--test/Object/obj2yaml.test158
-rw-r--r--test/Other/new-pm-defaults.ll1
-rw-r--r--test/Other/new-pm-lto-defaults.ll9
-rw-r--r--test/Other/new-pm-thinlto-defaults.ll1
-rw-r--r--test/ThinLTO/X86/deadstrip.ll30
-rw-r--r--test/ThinLTO/X86/funcimport2.ll4
-rw-r--r--test/ThinLTO/X86/internalize.ll9
-rw-r--r--test/ThinLTO/X86/lazyload_metadata.ll4
-rw-r--r--test/ThinLTO/X86/reference_non_importable.ll2
-rw-r--r--test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll339
-rw-r--r--test/Transforms/CallSiteSplitting/callsite-split.ll119
-rw-r--r--test/Transforms/CodeExtractor/PartialInlineNoInline.ll45
-rw-r--r--test/Transforms/CodeGenPrepare/ARM/sink-addrmode.ll18
-rw-r--r--test/Transforms/CodeGenPrepare/X86/sink-addrmode-base.ll475
-rw-r--r--test/Transforms/ExpandMemCmp/X86/lit.local.cfg (renamed from test/LibDriver/lit.local.cfg)0
-rw-r--r--test/Transforms/ExpandMemCmp/X86/memcmp.ll (renamed from test/Transforms/CodeGenPrepare/X86/memcmp.ll)519
-rw-r--r--test/Transforms/IRCE/add-metadata-pre-post-loops.ll2
-rw-r--r--test/Transforms/IndVarSimplify/scev-phi-debug-info.ll71
-rw-r--r--test/Transforms/InstCombine/debuginfo_add.ll108
-rw-r--r--test/Transforms/InstCombine/shift.ll260
-rw-r--r--test/Transforms/LICM/sinking.ll284
-rw-r--r--test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll46
-rw-r--r--test/Transforms/LoopPredication/widened.ll138
-rw-r--r--test/Transforms/LoopVectorize/pr34681.ll122
-rw-r--r--test/Transforms/LoopVectorize/version-mem-access.ll5
-rw-r--r--test/Transforms/LowerTypeTests/blockaddress.ll27
-rw-r--r--test/Transforms/LowerTypeTests/import-unsat.ll1
-rw-r--r--test/Transforms/PGOProfile/Inputs/irreducible.proftext29
-rw-r--r--test/Transforms/PGOProfile/irreducible.ll184
-rw-r--r--test/Transforms/PGOProfile/thinlto_samplepgo_icp2.ll2
-rw-r--r--test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll48
-rw-r--r--test/Transforms/SLPVectorizer/X86/call.ll245
-rw-r--r--test/Transforms/SLPVectorizer/X86/cast.ll51
-rw-r--r--test/Transforms/SLPVectorizer/X86/load-merge.ll50
-rw-r--r--test/Transforms/SLPVectorizer/X86/stores_vectorize.ll84
-rw-r--r--test/Transforms/SampleProfile/indirect-call.ll2
-rw-r--r--test/Transforms/SimplifyCFG/merge-cond-stores-2.ll2
-rw-r--r--test/Transforms/WholeProgramDevirt/import-indir.ll1
-rw-r--r--test/lit.cfg.py4
-rw-r--r--test/lit.site.cfg.py.in3
-rw-r--r--test/tools/dsymutil/cmdline.test2
-rw-r--r--test/tools/gold/X86/asm_undefined2.ll3
-rw-r--r--test/tools/gold/X86/coff.ll2
-rw-r--r--test/tools/gold/X86/common.ll2
-rw-r--r--test/tools/gold/X86/emit-llvm.ll6
-rw-r--r--test/tools/gold/X86/global_with_section.ll16
-rw-r--r--test/tools/gold/X86/parallel.ll8
-rw-r--r--test/tools/gold/X86/thinlto_linkonceresolution.ll2
-rw-r--r--test/tools/gold/X86/thinlto_weak_library.ll2
-rw-r--r--test/tools/gold/X86/visibility.ll2
-rw-r--r--test/tools/llvm-ar/default-add.test3
-rw-r--r--test/tools/llvm-cfi-verify/X86/Inputs/protected-lineinfo.s195
-rw-r--r--test/tools/llvm-cfi-verify/X86/Inputs/unprotected-fullinfo.s380
-rw-r--r--test/tools/llvm-cfi-verify/X86/Inputs/unprotected-lineinfo.s159
-rw-r--r--test/tools/llvm-cfi-verify/X86/Inputs/unprotected-nolineinfo.s87
-rw-r--r--test/tools/llvm-cfi-verify/X86/blacklist-expected-unprotected.s17
-rw-r--r--test/tools/llvm-cfi-verify/X86/blacklist-match-fun.s17
-rw-r--r--test/tools/llvm-cfi-verify/X86/blacklist-unexpected-protected.s17
-rw-r--r--test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s5
-rw-r--r--test/tools/llvm-cfi-verify/X86/protected-lineinfo.s204
-rw-r--r--test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s168
-rw-r--r--test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s91
-rw-r--r--test/tools/llvm-lib/Inputs/a.s (renamed from test/LibDriver/Inputs/a.s)0
-rw-r--r--test/tools/llvm-lib/Inputs/b.s (renamed from test/LibDriver/Inputs/b.s)0
-rwxr-xr-xtest/tools/llvm-lib/Inputs/cl-gl.obj (renamed from test/LibDriver/Inputs/cl-gl.obj)  bin  3734 -> 3734 bytes
-rw-r--r--test/tools/llvm-lib/Inputs/resource.res (renamed from test/LibDriver/Inputs/resource.res)  bin  108 -> 108 bytes
-rw-r--r--test/tools/llvm-lib/infer-output-path.test (renamed from test/LibDriver/infer-output-path.test)0
-rw-r--r--test/tools/llvm-lib/invalid.test (renamed from test/LibDriver/invalid.test)0
-rw-r--r--test/tools/llvm-lib/libpath.test (renamed from test/LibDriver/libpath.test)0
-rw-r--r--test/tools/llvm-lib/lit.local.cfg3
-rw-r--r--test/tools/llvm-lib/no-inputs.test (renamed from test/LibDriver/no-inputs.test)0
-rw-r--r--test/tools/llvm-lib/resource.test (renamed from test/LibDriver/resource.test)0
-rw-r--r--test/tools/llvm-lib/thin.test (renamed from test/LibDriver/thin.test)0
-rw-r--r--test/tools/llvm-lib/use-paths.test (renamed from test/LibDriver/use-paths.test)0
-rw-r--r--test/tools/llvm-nm/X86/externalonly.test1
-rw-r--r--test/tools/llvm-nm/X86/importlibrary.test2
-rw-r--r--test/tools/llvm-objcopy/Inputs/dwarf.dwo  bin  0 -> 3568 bytes
-rw-r--r--test/tools/llvm-objcopy/check-addr-offset-align-binary.test40
-rw-r--r--test/tools/llvm-objcopy/check-addr-offset-align.test67
-rw-r--r--test/tools/llvm-objcopy/drawf-fission.test43
-rw-r--r--test/tools/llvm-objdump/X86/Inputs/macho-invalid-reloc-section-index  bin  0 -> 2768 bytes
-rw-r--r--test/tools/llvm-objdump/X86/malformed-machos.test3
-rw-r--r--tools/dsymutil/DwarfLinker.cpp6
-rw-r--r--tools/dsymutil/dsymutil.cpp13
-rw-r--r--tools/llvm-ar/CMakeLists.txt6
-rw-r--r--tools/llvm-ar/llvm-ar.cpp20
-rw-r--r--tools/llvm-cfi-verify/CMakeLists.txt2
-rw-r--r--tools/llvm-cfi-verify/LLVMBuild.txt2
-rw-r--r--tools/llvm-cfi-verify/lib/CMakeLists.txt4
-rw-r--r--tools/llvm-cfi-verify/lib/FileAnalysis.cpp49
-rw-r--r--tools/llvm-cfi-verify/lib/FileAnalysis.h9
-rw-r--r--tools/llvm-cfi-verify/lib/LLVMBuild.txt2
-rw-r--r--tools/llvm-cfi-verify/llvm-cfi-verify.cpp135
-rw-r--r--tools/llvm-cov/gcov.cpp2
-rw-r--r--tools/llvm-cvtres/llvm-cvtres.cpp2
-rw-r--r--tools/llvm-cxxdump/llvm-cxxdump.cpp15
-rw-r--r--tools/llvm-cxxfilt/CMakeLists.txt4
-rw-r--r--tools/llvm-dwp/CMakeLists.txt4
-rw-r--r--tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp6
-rw-r--r--tools/llvm-mcmarkup/llvm-mcmarkup.cpp13
-rw-r--r--tools/llvm-mt/llvm-mt.cpp4
-rw-r--r--tools/llvm-nm/CMakeLists.txt4
-rw-r--r--tools/llvm-nm/llvm-nm.cpp13
-rw-r--r--tools/llvm-objcopy/CMakeLists.txt4
-rw-r--r--tools/llvm-objcopy/Object.cpp20
-rw-r--r--tools/llvm-objcopy/Object.h1
-rw-r--r--tools/llvm-objcopy/llvm-objcopy.cpp81
-rw-r--r--tools/llvm-objdump/CMakeLists.txt4
-rw-r--r--tools/llvm-objdump/llvm-objdump.cpp30
-rw-r--r--tools/llvm-pdbutil/llvm-pdbutil.cpp25
-rw-r--r--tools/llvm-readobj/CMakeLists.txt4
-rw-r--r--tools/llvm-readobj/llvm-readobj.cpp15
-rw-r--r--tools/llvm-size/CMakeLists.txt4
-rw-r--r--tools/llvm-size/llvm-size.cpp15
-rw-r--r--tools/llvm-strings/CMakeLists.txt3
-rw-r--r--tools/llvm-symbolizer/CMakeLists.txt4
-rw-r--r--tools/llvm-xray/trie-node.h92
-rw-r--r--tools/llvm-xray/xray-converter.cc198
-rw-r--r--tools/llvm-xray/xray-converter.h7
-rw-r--r--tools/llvm-xray/xray-stacks.cc191
-rw-r--r--tools/opt/opt.cpp1
-rw-r--r--unittests/ADT/STLExtrasTest.cpp20
-rw-r--r--unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp119
-rw-r--r--unittests/CodeGen/LowLevelTypeTest.cpp78
-rw-r--r--unittests/CodeGen/MachineInstrTest.cpp4
-rw-r--r--unittests/DebugInfo/DWARF/DWARFFormValueTest.cpp6
-rw-r--r--unittests/IR/IRBuilderTest.cpp51
-rw-r--r--unittests/Support/DynamicLibrary/CMakeLists.txt2
-rw-r--r--unittests/Support/FileOutputBufferTest.cpp22
-rw-r--r--unittests/Support/SpecialCaseListTest.cpp24
-rw-r--r--unittests/tools/llvm-cfi-verify/CMakeLists.txt1
-rw-r--r--unittests/tools/llvm-cfi-verify/FileAnalysis.cpp54
-rw-r--r--unittests/tools/llvm-cfi-verify/GraphBuilder.cpp1
-rw-r--r--utils/FileCheck/FileCheck.cpp7
-rw-r--r--utils/TableGen/CMakeLists.txt1
-rw-r--r--utils/TableGen/CodeGenDAGPatterns.cpp5
-rw-r--r--utils/TableGen/DAGISelMatcherEmitter.cpp2
-rw-r--r--utils/TableGen/GlobalISelEmitter.cpp3
-rw-r--r--utils/TableGen/X86EVEX2VEXTablesEmitter.cpp8
-rwxr-xr-xutils/update_llc_test_checks.py18
-rwxr-xr-xutils/update_mir_test_checks.py3
848 files changed, 32598 insertions, 10117 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 04565038311..216702eabaf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,26 +2,20 @@
cmake_minimum_required(VERSION 3.4.3)
-if(POLICY CMP0022)
- cmake_policy(SET CMP0022 NEW) # automatic when 2.8.12 is required
-endif()
+cmake_policy(SET CMP0022 NEW)
-if (POLICY CMP0051)
- # CMake 3.1 and higher include generator expressions of the form
- # $<TARGETLIB:obj> in the SOURCES property. These need to be
- # stripped everywhere that access the SOURCES property, so we just
- # defer to the OLD behavior of not including generator expressions
- # in the output for now.
- cmake_policy(SET CMP0051 OLD)
-endif()
+cmake_policy(SET CMP0048 NEW)
-if(POLICY CMP0056)
- cmake_policy(SET CMP0056 NEW)
-endif()
+# CMake 3.1 and higher include generator expressions of the form
+# $<TARGETLIB:obj> in the SOURCES property. These need to be
+# stripped everywhere that access the SOURCES property, so we just
+# defer to the OLD behavior of not including generator expressions
+# in the output for now.
+cmake_policy(SET CMP0051 OLD)
-if(POLICY CMP0057)
- cmake_policy(SET CMP0057 NEW)
-endif()
+cmake_policy(SET CMP0056 NEW)
+
+cmake_policy(SET CMP0057 NEW)
if(NOT DEFINED LLVM_VERSION_MAJOR)
set(LLVM_VERSION_MAJOR 6)
@@ -36,13 +30,6 @@ if(NOT DEFINED LLVM_VERSION_SUFFIX)
set(LLVM_VERSION_SUFFIX svn)
endif()
-if (POLICY CMP0048)
- cmake_policy(SET CMP0048 NEW)
- set(cmake_3_0_PROJ_VERSION
- VERSION ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH})
- set(cmake_3_0_LANGUAGES LANGUAGES)
-endif()
-
if (NOT PACKAGE_VERSION)
set(PACKAGE_VERSION
"${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}${LLVM_VERSION_SUFFIX}")
@@ -56,9 +43,8 @@ if ((CMAKE_GENERATOR MATCHES "Visual Studio") AND (CMAKE_GENERATOR_TOOLSET STREQ
endif()
project(LLVM
- ${cmake_3_0_PROJ_VERSION}
- ${cmake_3_0_LANGUAGES}
- C CXX ASM)
+ VERSION ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}
+ LANGUAGES C CXX ASM)
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
message(STATUS "No build type selected, default to Debug")
@@ -193,6 +179,9 @@ set(CMAKE_MODULE_PATH
# for use by clang_complete, YouCompleteMe, etc.
set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
+option(LLVM_INSTALL_BINUTILS_SYMLINKS
+ "Install symlinks from the binutils tool names to the corresponding LLVM tools." OFF)
+
option(LLVM_INSTALL_UTILS "Include utility binaries in the 'install' target." OFF)
option(LLVM_INSTALL_TOOLCHAIN_ONLY "Only include toolchain files in the 'install' target." OFF)
@@ -766,6 +755,7 @@ configure_file(
add_custom_target(srpm
COMMAND cpack -G TGZ --config CPackSourceConfig.cmake -B ${LLVM_SRPM_DIR}/SOURCES
COMMAND rpmbuild -bs --define '_topdir ${LLVM_SRPM_DIR}' ${LLVM_SRPM_BINARY_SPECFILE})
+set_target_properties(srpm PROPERTIES FOLDER "Misc")
# They are not referenced. See set_output_directory().
@@ -978,6 +968,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
# Installing the headers needs to depend on generating any public
# tablegen'd headers.
add_custom_target(llvm-headers DEPENDS intrinsics_gen)
+ set_target_properties(llvm-headers PROPERTIES FOLDER "Misc")
if (NOT CMAKE_CONFIGURATION_TYPES)
add_custom_target(install-llvm-headers
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index a1b4846f19a..7f33591de0c 100644
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -157,7 +157,7 @@ if( NOT PURE_WINDOWS AND NOT LLVM_USE_SANITIZER MATCHES "Memory.*")
set(HAVE_TERMINFO 0)
endif()
- find_library(ICONV_LIBRARY_PATH NAMES iconv libiconv libiconv-2)
+ find_library(ICONV_LIBRARY_PATH NAMES iconv libiconv libiconv-2 c)
set(LLVM_LIBXML2_ENABLED 0)
set(LIBXML2_FOUND 0)
if((LLVM_ENABLE_LIBXML2) AND ((CMAKE_SYSTEM_NAME MATCHES "Linux") AND (ICONV_LIBRARY_PATH) OR APPLE))
diff --git a/cmake/modules/AddOCaml.cmake b/cmake/modules/AddOCaml.cmake
index 1d8094cc505..02bab684637 100644
--- a/cmake/modules/AddOCaml.cmake
+++ b/cmake/modules/AddOCaml.cmake
@@ -221,3 +221,4 @@ add_custom_target(ocaml_make_directory
COMMAND "${CMAKE_COMMAND}" "-E" "make_directory" "${LLVM_LIBRARY_DIR}/ocaml/llvm")
add_custom_target("ocaml_all")
set_target_properties(ocaml_all PROPERTIES FOLDER "Misc")
+set_target_properties(ocaml_make_directory PROPERTIES FOLDER "Misc")
diff --git a/cmake/modules/CrossCompile.cmake b/cmake/modules/CrossCompile.cmake
index b0726ebd4de..ff092b257ab 100644
--- a/cmake/modules/CrossCompile.cmake
+++ b/cmake/modules/CrossCompile.cmake
@@ -16,12 +16,15 @@ function(llvm_create_cross_target_internal target_name toolchain buildtype)
COMMAND ${CMAKE_COMMAND} -E make_directory ${LLVM_${target_name}_BUILD}
COMMENT "Creating ${LLVM_${target_name}_BUILD}...")
+ add_custom_target(CREATE_LLVM_${target_name}
+ DEPENDS ${LLVM_${target_name}_BUILD})
+
add_custom_command(OUTPUT ${LLVM_${target_name}_BUILD}/CMakeCache.txt
COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}"
${CROSS_TOOLCHAIN_FLAGS_${target_name}} ${CMAKE_SOURCE_DIR}
-DLLVM_TARGET_IS_CROSSCOMPILE_HOST=TRUE
WORKING_DIRECTORY ${LLVM_${target_name}_BUILD}
- DEPENDS ${LLVM_${target_name}_BUILD}
+ DEPENDS CREATE_LLVM_${target_name}
COMMENT "Configuring ${target_name} LLVM...")
add_custom_target(CONFIGURE_LLVM_${target_name}
diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index 03b96645249..cf1ece24848 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -232,6 +232,10 @@ if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 )
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -m32")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -m32")
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -m32")
+
+ # FIXME: CMAKE_SIZEOF_VOID_P is still 8
+ add_definitions(-D_LARGEFILE_SOURCE)
+ add_definitions(-D_FILE_OFFSET_BITS=64)
endif( LLVM_BUILD_32_BITS )
endif( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 )
@@ -242,6 +246,7 @@ if (ANDROID AND (ANDROID_NATIVE_API_LEVEL LESS 24))
set(LLVM_FORCE_SMALLFILE_FOR_ANDROID TRUE)
endif()
if( CMAKE_SIZEOF_VOID_P EQUAL 4 AND NOT LLVM_FORCE_SMALLFILE_FOR_ANDROID)
+ # FIXME: It isn't handled in LLVM_BUILD_32_BITS.
add_definitions( -D_LARGEFILE_SOURCE )
add_definitions( -D_FILE_OFFSET_BITS=64 )
endif()
diff --git a/cmake/modules/TableGen.cmake b/cmake/modules/TableGen.cmake
index 7f17f7016b6..95de53a547b 100644
--- a/cmake/modules/TableGen.cmake
+++ b/cmake/modules/TableGen.cmake
@@ -110,19 +110,6 @@ function(add_public_tablegen_target target)
set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} ${target} PARENT_SCOPE)
endfunction()
-if(LLVM_USE_HOST_TOOLS AND NOT TARGET NATIVE_LIB_LLVMTABLEGEN)
- llvm_ExternalProject_BuildCmd(tblgen_build_cmd LLVMSupport
- ${LLVM_NATIVE_BUILD}
- CONFIGURATION Release)
- add_custom_command(OUTPUT LIB_LLVMTABLEGEN
- COMMAND ${tblgen_build_cmd}
- DEPENDS CONFIGURE_LLVM_NATIVE
- WORKING_DIRECTORY ${LLVM_NATIVE_BUILD}
- COMMENT "Building libLLVMTableGen for native TableGen..."
- USES_TERMINAL)
- add_custom_target(NATIVE_LIB_LLVMTABLEGEN DEPENDS LIB_LLVMTABLEGEN)
-endif()
-
macro(add_tablegen target project)
set(${target}_OLD_LLVM_LINK_COMPONENTS ${LLVM_LINK_COMPONENTS})
set(LLVM_LINK_COMPONENTS ${LLVM_LINK_COMPONENTS} TableGen)
@@ -166,7 +153,7 @@ macro(add_tablegen target project)
CONFIGURATION Release)
add_custom_command(OUTPUT ${${project}_TABLEGEN_EXE}
COMMAND ${tblgen_build_cmd}
- DEPENDS ${target} NATIVE_LIB_LLVMTABLEGEN
+ DEPENDS CONFIGURE_LLVM_NATIVE ${target}
WORKING_DIRECTORY ${LLVM_NATIVE_BUILD}
COMMENT "Building native TableGen..."
USES_TERMINAL)
diff --git a/docs/CMake.rst b/docs/CMake.rst
index 473672b5f73..05edec64da3 100644
--- a/docs/CMake.rst
+++ b/docs/CMake.rst
@@ -224,6 +224,10 @@ LLVM-specific variables
Generate build targets for the LLVM tools. Defaults to ON. You can use this
option to disable the generation of build targets for the LLVM tools.
+**LLVM_INSTALL_BINUTILS_SYMLINKS**:BOOL
+ Install symlinks from the binutils tool names to the corresponding LLVM tools.
+ For example, ar will be symlinked to llvm-ar.
+
**LLVM_BUILD_EXAMPLES**:BOOL
Build LLVM examples. Defaults to OFF. Targets for building each example are
generated in any case. See documentation for *LLVM_BUILD_TOOLS* above for more
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
index f1f93c7a228..0f2681e0cd8 100644
--- a/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@ -3,7 +3,7 @@ if (DOXYGEN_FOUND)
if (LLVM_ENABLE_DOXYGEN)
set(abs_top_srcdir ${CMAKE_CURRENT_SOURCE_DIR})
set(abs_top_builddir ${CMAKE_CURRENT_BINARY_DIR})
-
+
if (HAVE_DOT)
set(DOT ${LLVM_PATH_DOT})
endif()
@@ -21,20 +21,20 @@ if (LLVM_ENABLE_DOXYGEN)
set(enable_external_search "NO")
set(extra_search_mappings "")
endif()
-
+
# If asked, configure doxygen for the creation of a Qt Compressed Help file.
option(LLVM_ENABLE_DOXYGEN_QT_HELP
"Generate a Qt Compressed Help file." OFF)
if (LLVM_ENABLE_DOXYGEN_QT_HELP)
set(LLVM_DOXYGEN_QCH_FILENAME "org.llvm.qch" CACHE STRING
"Filename of the Qt Compressed help file")
- set(LLVM_DOXYGEN_QHP_NAMESPACE "org.llvm" CACHE STRING
+ set(LLVM_DOXYGEN_QHP_NAMESPACE "org.llvm" CACHE STRING
"Namespace under which the intermediate Qt Help Project file lives")
set(LLVM_DOXYGEN_QHP_CUST_FILTER_NAME "${PACKAGE_STRING}" CACHE STRING
"See http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-filters")
set(LLVM_DOXYGEN_QHP_CUST_FILTER_ATTRS "${PACKAGE_NAME},${PACKAGE_VERSION}" CACHE STRING
"See http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes")
- find_program(LLVM_DOXYGEN_QHELPGENERATOR_PATH qhelpgenerator
+ find_program(LLVM_DOXYGEN_QHELPGENERATOR_PATH qhelpgenerator
DOC "Path to the qhelpgenerator binary")
if (NOT LLVM_DOXYGEN_QHELPGENERATOR_PATH)
message(FATAL_ERROR "Failed to find qhelpgenerator binary")
@@ -55,7 +55,7 @@ if (LLVM_ENABLE_DOXYGEN)
set(llvm_doxygen_qhp_cust_filter_name "")
set(llvm_doxygen_qhp_cust_filter_attrs "")
endif()
-
+
option(LLVM_DOXYGEN_SVG
"Use svg instead of png files for doxygen graphs." OFF)
if (LLVM_DOXYGEN_SVG)
@@ -113,6 +113,7 @@ if (LLVM_ENABLE_SPHINX)
if (${SPHINX_OUTPUT_MAN})
add_sphinx_target(man llvm)
add_sphinx_target(man llvm-dwarfdump)
+ add_sphinx_target(man dsymutil)
endif()
endif()
diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst
index 44cc57cebaf..9078f65e01c 100644
--- a/docs/CommandGuide/FileCheck.rst
+++ b/docs/CommandGuide/FileCheck.rst
@@ -86,6 +86,11 @@ OPTIONS
All other variables get undefined after each encountered ``CHECK-LABEL``.
+.. option:: -D<VAR=VALUE>
+
+ Sets a filecheck variable ``VAR`` with value ``VALUE`` that can be used in
+ ``CHECK:`` lines.
+
.. option:: -version
Show the version number of this program.
diff --git a/docs/CommandGuide/dsymutil.rst b/docs/CommandGuide/dsymutil.rst
new file mode 100644
index 00000000000..a29bc3c295c
--- /dev/null
+++ b/docs/CommandGuide/dsymutil.rst
@@ -0,0 +1,89 @@
+dsymutil - manipulate archived DWARF debug symbol files
+=======================================================
+
+SYNOPSIS
+--------
+
+| :program:`dsymutil` [*options*] *executable*
+
+DESCRIPTION
+-----------
+
+:program:`dsymutil` links the DWARF debug information found in the object files
+for an executable *executable* by using debug symbols information contained in
+its symbol table. By default, the linked debug information is placed in a
+``.dSYM`` bundle with the same name as the executable.
+
+OPTIONS
+-------
+.. option:: --arch=<arch>
+
+ Link DWARF debug information only for specified CPU architecture types.
+ Architectures may be specified by name. When using this option, an error will
+ be returned if any architectures can not be properly linked. This option can
+ be specified multiple times, once for each desired architecture. All CPU
+ architectures will be linked by default and any architectures that can't be
+ properly linked will cause :program:`dsymutil` to return an error.
+
+.. option:: --dump-debug-map
+
+ Dump the *executable*'s debug-map (the list of the object files containing the
+ debug information) in YAML format and exit. No DWARF link will take place.
+
+.. option:: -f, --flat
+
+ Produce a flat dSYM file. A ``.dwarf`` extension will be appended to the
+ executable name unless the output file is specified using the -o option.
+
+.. option:: --no-odr
+
+ Do not use ODR (One Definition Rule) for uniquing C++ types.
+
+.. option:: --no-output
+
+ Do the link in memory, but do not emit the result file.
+
+.. option:: --no-swiftmodule-timestamp
+
+ Don't check the timestamp for swiftmodule files.
+
+.. option:: -j <n>, --num-threads=<n>
+
+ Specifies the maximum number (``n``) of simultaneous threads to use when
+ linking multiple architectures.
+
+.. option:: -o <filename>
+
+ Specifies an alternate ``path`` to place the dSYM bundle. The default dSYM
+ bundle path is created by appending ``.dSYM`` to the executable name.
+
+.. option:: --oso-prepend-path=<path>
+
+ Specifies a ``path`` to prepend to all debug symbol object file paths.
+
+.. option:: -s, --symtab
+
+ Dumps the symbol table found in *executable* or object file(s) and exits.
+
+.. option:: -v, --verbose
+
+ Display verbose information when linking.
+
+.. option:: --version
+
+ Display the version of the tool.
+
+.. option:: -y
+
+ Treat *executable* as a YAML debug-map rather than an executable.
+
+EXIT STATUS
+-----------
+
+:program:`dsymutil` returns 0 if the DWARF debug information was linked
+successfully. Otherwise, it returns 1.
+
+SEE ALSO
+--------
+
+:manpage:`llvm-dwarfdump(1)`
diff --git a/docs/CommandGuide/index.rst b/docs/CommandGuide/index.rst
index 5a0a98ceb1f..805df00c173 100644
--- a/docs/CommandGuide/index.rst
+++ b/docs/CommandGuide/index.rst
@@ -30,6 +30,7 @@ Basic Commands
llvm-stress
llvm-symbolizer
llvm-dwarfdump
+ dsymutil
Debugging Tools
~~~~~~~~~~~~~~~
diff --git a/docs/CommandGuide/lli.rst b/docs/CommandGuide/lli.rst
index 9da13ee47e0..58481073d06 100644
--- a/docs/CommandGuide/lli.rst
+++ b/docs/CommandGuide/lli.rst
@@ -122,7 +122,7 @@ CODE GENERATION OPTIONS
Choose the code model from:
- .. code-block:: perl
+ .. code-block:: text
default: Target default code model
small: Small code model
@@ -154,7 +154,7 @@ CODE GENERATION OPTIONS
Instruction schedulers available (before register allocation):
- .. code-block:: perl
+ .. code-block:: text
=default: Best scheduler for the target
=none: No scheduling: breadth first sequencing
@@ -168,7 +168,7 @@ CODE GENERATION OPTIONS
Register allocator to use (default=linearscan)
- .. code-block:: perl
+ .. code-block:: text
=bigblock: Big-block register allocator
=linearscan: linear scan register allocator =local - local register allocator
@@ -178,7 +178,7 @@ CODE GENERATION OPTIONS
Choose relocation model from:
- .. code-block:: perl
+ .. code-block:: text
=default: Target default relocation model
=static: Non-relocatable code =pic - Fully relocatable, position independent code
@@ -188,7 +188,7 @@ CODE GENERATION OPTIONS
Spiller to use (default=local)
- .. code-block:: perl
+ .. code-block:: text
=simple: simple spiller
=local: local spiller
@@ -197,7 +197,7 @@ CODE GENERATION OPTIONS
Choose style of code to emit from X86 backend:
- .. code-block:: perl
+ .. code-block:: text
=att: Emit AT&T-style assembly
=intel: Emit Intel-style assembly
diff --git a/docs/CommandGuide/llvm-pdbutil.rst b/docs/CommandGuide/llvm-pdbutil.rst
index 8836f3a3eb4..29d487e0e74 100644
--- a/docs/CommandGuide/llvm-pdbutil.rst
+++ b/docs/CommandGuide/llvm-pdbutil.rst
@@ -142,7 +142,7 @@ Symbol Type Options
Displays class definitions in the specified format.
- .. code-block:: perl
+ .. code-block:: text
=all - Display all class members including data, constants, typedefs, functions, etc (default)
=layout - Only display members that contribute to class size.
@@ -152,7 +152,7 @@ Symbol Type Options
Displays classes in the specified order.
- .. code-block:: perl
+ .. code-block:: text
=none - Undefined / no particular sort order (default)
=name - Sort classes by name
@@ -200,7 +200,7 @@ Symbol Type Options
Type of symbols to dump when -globals, -externals, or -module-syms is
specified. (default all)
- .. code-block:: perl
+ .. code-block:: text
=thunks - Display thunk symbols
=data - Display data symbols
@@ -212,7 +212,7 @@ Symbol Type Options
For symbols dumped via the -module-syms, -globals, or -externals options, sort
the results in specified order.
- .. code-block:: perl
+ .. code-block:: text
=none - Undefined / no particular sort order
=name - Sort symbols by name
diff --git a/docs/GetElementPtr.rst b/docs/GetElementPtr.rst
index c2da640fe06..b593871695f 100644
--- a/docs/GetElementPtr.rst
+++ b/docs/GetElementPtr.rst
@@ -196,7 +196,7 @@ illegal.
In order to access the 18th integer in the array, you would need to do the
following:
-.. code-block:: llvm
+.. code-block:: text
%idx = getelementptr { [40 x i32]* }, { [40 x i32]* }* %, i64 0, i32 0
%arr = load [40 x i32]** %idx
diff --git a/docs/HowToCrossCompileBuiltinsOnArm.rst b/docs/HowToCrossCompileBuiltinsOnArm.rst
new file mode 100644
index 00000000000..4b4d563a5a9
--- /dev/null
+++ b/docs/HowToCrossCompileBuiltinsOnArm.rst
@@ -0,0 +1,201 @@
+===================================================================
+How to Cross Compile Compiler-rt Builtins For Arm
+===================================================================
+
+Introduction
+============
+
+This document contains information about building and testing the builtins part
+of compiler-rt for an Arm target, from an x86_64 Linux machine.
+
+While this document concentrates on Arm and Linux, the general principles
+should apply to other targets supported by compiler-rt. Further contributions
+for other targets are welcome.
+
+The instructions in this document depend on libraries and programs external to
+LLVM. There are many ways to install and configure these dependencies, so you
+may need to adapt the instructions here to fit your own local situation.
+
+Prerequisites
+=============
+
+In this use case we'll be using CMake on a Debian-based Linux system,
+cross-compiling from an x86_64 host to a hard-float Armv7-A target. We'll be
+using as many of the LLVM tools as we can, but it is possible to use GNU
+equivalents.
+
+ * A build of LLVM/clang for the llvm tools and ``llvm-config``
+ * The ``qemu-arm`` user mode emulator
+ * An ``arm-linux-gnueabihf`` sysroot
+
+See https://compiler-rt.llvm.org/ for more information about the dependencies
+on clang and LLVM.
+
+``qemu-arm`` should be available as a package for your Linux distribution.
+
+The most complicated of the prerequisites to satisfy is the arm-linux-gnueabihf
+sysroot. The :doc:`HowToCrossCompileLLVM` document has information about how to
+use a Linux distribution's multiarch support to fulfill the dependencies for
+building LLVM. Alternatively, as building and testing just the compiler-rt
+builtins requires fewer dependencies than LLVM, it is possible to use the
+Linaro arm-linux-gnueabihf gcc installation as our sysroot.
+
+Building compiler-rt builtins for Arm
+=====================================
+We will be doing a standalone build of compiler-rt using the following cmake
+options.
+
+* ``path/to/llvm/projects/compiler-rt``
+* ``-DCOMPILER_RT_BUILD_BUILTINS=ON``
+* ``-DCOMPILER_RT_BUILD_SANITIZERS=OFF``
+* ``-DCOMPILER_RT_BUILD_XRAY=OFF``
+* ``-DCOMPILER_RT_BUILD_LIBFUZZER=OFF``
+* ``-DCOMPILER_RT_BUILD_PROFILE=OFF``
+* ``-DCMAKE_C_COMPILER=/path/to/clang``
+* ``-DCMAKE_AR=/path/to/llvm-ar``
+* ``-DCMAKE_NM=/path/to/llvm-nm``
+* ``-DCMAKE_RANLIB=/path/to/llvm-ranlib``
+* ``-DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=lld"``
+* ``-DCMAKE_C_COMPILER_TARGET="arm-linux-gnueabihf"``
+* ``-DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON``
+* ``-DLLVM_CONFIG_PATH=/path/to/llvm-config``
+* ``-DCMAKE_C_FLAGS="build-c-flags"``
+
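+Putting these options together, one possible invocation is sketched below. The
+Ninja generator and the ``/path/to/...`` locations are only placeholders for
+your own tools and paths, and ``build-c-flags`` stands for the flags discussed
+next:
+
+.. code-block:: text
+
+  cmake path/to/llvm/projects/compiler-rt -G Ninja \
+    -DCOMPILER_RT_BUILD_BUILTINS=ON \
+    -DCOMPILER_RT_BUILD_SANITIZERS=OFF \
+    -DCOMPILER_RT_BUILD_XRAY=OFF \
+    -DCOMPILER_RT_BUILD_LIBFUZZER=OFF \
+    -DCOMPILER_RT_BUILD_PROFILE=OFF \
+    -DCMAKE_C_COMPILER=/path/to/clang \
+    -DCMAKE_AR=/path/to/llvm-ar \
+    -DCMAKE_NM=/path/to/llvm-nm \
+    -DCMAKE_RANLIB=/path/to/llvm-ranlib \
+    -DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=lld" \
+    -DCMAKE_C_COMPILER_TARGET="arm-linux-gnueabihf" \
+    -DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON \
+    -DLLVM_CONFIG_PATH=/path/to/llvm-config \
+    -DCMAKE_C_FLAGS="build-c-flags"
+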
+The build-c-flags need to be sufficient to pass the CMake compiler check and
+to compile compiler-rt. When using a GCC 7 Linaro arm-linux-gnueabihf
+installation, the following flags are needed:
+
+* ``--target=arm-linux-gnueabihf``
+* ``--march=armv7a``
+* ``--gcc-toolchain=/path/to/dir/toolchain``
+* ``--sysroot=/path/to/toolchain/arm-linux-gnueabihf/libc``
+
+Depending on how your sysroot is laid out, you may not need ``--gcc-toolchain``.
+For example, if you have added armhf as an architecture using your Linux
+distribution's multiarch support, then you should be able to use ``--sysroot=/``.
+
+Once cmake has completed, the builtins can be built with ``ninja builtins``.
+
+Testing compiler-rt builtins using qemu-arm
+===========================================
+To test the builtins library we need to add a few more cmake flags to enable
+testing and to set up the compiler and flags for the test cases. We must also
+tell cmake that we wish to run the tests on ``qemu-arm``.
+
+* ``-DCOMPILER_RT_EMULATOR="qemu-arm -L /path/to/armhf/sysroot``
+* ``-DCOMPILER_RT_INCLUDE_TESTS=ON``
+* ``-DCOMPILER_RT_TEST_COMPILER="/path/to/clang"``
+* ``-DCOMPILER_RT_TEST_COMPILER_CFLAGS="test-c-flags"``
+
+The ``/path/to/armhf/sysroot`` should be the same as the one passed to
+``--sysroot`` in the "build-c-flags".
+
+The "test-c-flags" can be the same as the "build-c-flags", with the addition
+of ``"-fuse-ld=lld`` if you wish to use lld to link the tests.
+
+Once cmake has completed, the tests can be built and run using
+``ninja check-builtins``.
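+
+For example, the extra flags can be added by re-running cmake in the existing
+build directory before invoking the test target. This is only a sketch; the
+paths and ``test-c-flags`` are placeholders as before:
+
+.. code-block:: text
+
+  cmake . \
+    -DCOMPILER_RT_EMULATOR="qemu-arm -L /path/to/armhf/sysroot" \
+    -DCOMPILER_RT_INCLUDE_TESTS=ON \
+    -DCOMPILER_RT_TEST_COMPILER="/path/to/clang" \
+    -DCOMPILER_RT_TEST_COMPILER_CFLAGS="test-c-flags"
+  ninja check-builtins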
+
+Modifications for other Targets
+===============================
+
+Arm Soft-Float Target
+---------------------
+The instructions for the Arm hard-float target can be used for the soft-float
+target by substituting soft-float equivalents for the sysroot and target. The
+target to use is:
+
+* ``-DCMAKE_C_COMPILER_TARGET=arm-linux-gnueabi``
+
+Depending on whether you want to use floating-point instructions or not, you
+may need extra c-flags such as ``-mfloat-abi=softfp`` for use of floating-point
+instructions, and ``-mfloat-abi=soft -mfpu=none`` for software floating-point
+emulation.
+
+AArch64 Target
+--------------
+The instructions for Arm can be used for AArch64 by substituting AArch64
+equivalents for the sysroot, emulator and target.
+
+* ``-DCMAKE_C_COMPILER_TARGET=aarch64-linux-gnu``
+* ``-DCOMPILER_RT_EMULATOR="qemu-aarch64 -L /path/to/aarch64/sysroot``
+
+The ``CMAKE_C_FLAGS`` and ``COMPILER_RT_TEST_COMPILER_CFLAGS`` may also need:
+``"--sysroot=/path/to/aarch64/sysroot --gcc-toolchain=/path/to/gcc-toolchain"``
+
+Armv6-M, Armv7-M and Armv7E-M targets
+-------------------------------------
+If you wish to build, but not test, compiler-rt for Armv6-M, Armv7-M or
+Armv7E-M, then the easiest way is to use the ``BaremetalARM.cmake`` recipe in
+``clang/cmake/caches``.
+
+You will need a bare metal sysroot such as that provided by the GNU ARM
+Embedded toolchain.
+
+The libraries can be built with the cmake options:
+
+* ``-DBAREMETAL_ARMV6M_SYSROOT=/path/to/bare/metal/sysroot``
+* ``-DBAREMETAL_ARMV7M_SYSROOT=/path/to/bare/metal/sysroot``
+* ``-DBAREMETAL_ARMV7EM_SYSROOT=/path/to/bare/metal/sysroot``
+* ``-C /path/to/llvm/source/tools/clang/cmake/caches/BaremetalARM.cmake``
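+
+As a sketch, and assuming the placeholder paths above, a configuration using
+the cache file might look like:
+
+.. code-block:: text
+
+  cmake -G Ninja \
+    -DBAREMETAL_ARMV6M_SYSROOT=/path/to/bare/metal/sysroot \
+    -DBAREMETAL_ARMV7M_SYSROOT=/path/to/bare/metal/sysroot \
+    -DBAREMETAL_ARMV7EM_SYSROOT=/path/to/bare/metal/sysroot \
+    -C /path/to/llvm/source/tools/clang/cmake/caches/BaremetalARM.cmake \
+    /path/to/llvm/source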
+
+**Note** that for the recipe to work, the compiler-rt source must be checked
+out into the directory ``llvm/runtimes`` and not ``llvm/projects``.
+
+Building and testing the libraries using a similar method to Armv7-A is
+possible but more difficult. The main problems are:
+
+* There isn't a ``qemu-arm`` user-mode emulator for bare-metal systems. ``qemu-system-arm`` can be used, but it is significantly more difficult to set up.
+* The targets used to compile compiler-rt have the suffix ``-none-eabi``. This uses the BareMetal driver in clang, which by default won't find the libraries needed to pass the cmake compiler check.
+
+As the Armv6-M, Armv7-M and Armv7E-M builds of compiler-rt only use
+instructions that are also supported on Armv7-A, we can still get most of the
+value of running the tests using the same ``qemu-arm`` that we used for
+Armv7-A: build and run the test cases for Armv7-A, but use the builtins
+compiled for Armv6-M, Armv7-M or Armv7E-M. This will not catch instructions
+that are supported on Armv7-A but not on Armv6-M, Armv7-M and Armv7E-M.
+
+To get the cmake compile test to pass, the libraries needed to successfully
+link the test application will need to be manually added to ``CMAKE_C_FLAGS``.
+Alternatively, if you are using version 3.6 or above of cmake, you can use
+``CMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY`` to skip the link step.
+
+* ``-DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY``
+* ``-DCOMPILER_RT_OS_DIR="baremetal"``
+* ``-DCOMPILER_RT_BUILD_BUILTINS=ON``
+* ``-DCOMPILER_RT_BUILD_SANITIZERS=OFF``
+* ``-DCOMPILER_RT_BUILD_XRAY=OFF``
+* ``-DCOMPILER_RT_BUILD_LIBFUZZER=OFF``
+* ``-DCOMPILER_RT_BUILD_PROFILE=OFF``
+* ``-DCMAKE_C_COMPILER=${host_install_dir}/bin/clang``
+* ``-DCMAKE_C_COMPILER_TARGET="your *-none-eabi target"``
+* ``-DCMAKE_AR=/path/to/llvm-ar``
+* ``-DCMAKE_NM=/path/to/llvm-nm``
+* ``-DCMAKE_RANLIB=/path/to/llvm-ranlib``
+* ``-DCOMPILER_RT_BAREMETAL_BUILD=ON``
+* ``-DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON``
+* ``-DLLVM_CONFIG_PATH=/path/to/llvm-config``
+* ``-DCMAKE_C_FLAGS="build-c-flags"``
+* ``-DCMAKE_ASM_FLAGS="${arm_cflags}"``
+* ``-DCOMPILER_RT_EMULATOR="qemu-arm -L /path/to/armv7-A/sysroot"``
+* ``-DCOMPILER_RT_INCLUDE_TESTS=ON``
+* ``-DCOMPILER_RT_TEST_COMPILER="/path/to/clang"``
+* ``-DCOMPILER_RT_TEST_COMPILER_CFLAGS="test-c-flags"``
+
+The Armv6-M builtins will use the soft-float ABI. When compiling the tests for
+Armv7-A, we must include ``-mthumb -mfloat-abi=soft -mfpu=none`` in the
+test-c-flags. We must use an Armv7-A soft-float ABI sysroot for ``qemu-arm``.
+
+Unfortunately, at the time of writing, the Armv7-M and Armv7E-M builds of
+compiler-rt will always include assembler files containing floating-point
+instructions. This means that building for a CPU without a floating-point unit
+requires something like removing the ``arm_Thumb1_VFPv2_SOURCES`` from the
+``arm_Thumb1_SOURCES`` in ``builtins/CMakeLists.txt``. The float ABI of the
+compiler-rt library must be matched by the float ABI of the Armv7-A sysroot
+used by ``qemu-arm``.
+
+Depending on the linker used for the test cases, you may encounter
+BuildAttribute mismatches between the M-profile objects from compiler-rt and
+the A-profile objects from the tests. The lld linker does not check the
+BuildAttributes, so it can be used to link the tests by adding ``-fuse-ld=lld``
+to the ``COMPILER_RT_TEST_COMPILER_CFLAGS``.
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 9d910568bd5..8cbed7d87d1 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -542,7 +542,7 @@ symbol is assumed to be ``dso_preemptable``.
``dso_local``
The compiler may assume that a function or variable marked as ``dso_local``
- will resolve to a symbol within the same linkage unit. Direct access will
+ will resolve to a symbol within the same linkage unit. Direct access will
be generated even if the definition is not within this compilation unit.
.. _namedtypes:
@@ -597,9 +597,9 @@ Global variables in other translation units can also be declared, in which
case they don't have an initializer.
Either global variable definitions or declarations may have an explicit section
-to be placed in and may have an optional explicit alignment specified. If there
-is a mismatch between the explicit or inferred section information for the
-variable declaration and its definition the resulting behavior is undefined.
+to be placed in and may have an optional explicit alignment specified. If there
+is a mismatch between the explicit or inferred section information for the
+variable declaration and its definition the resulting behavior is undefined.
A variable may be defined as a global ``constant``, which indicates that
the contents of the variable will **never** be modified (enabling better
@@ -642,11 +642,11 @@ target supports it, it will emit globals to the section specified.
Additionally, the global can placed in a comdat if the target has the necessary
support.
-External declarations may have an explicit section specified. Section
-information is retained in LLVM IR for targets that make use of this
-information. Attaching section information to an external declaration is an
-assertion that its definition is located in the specified section. If the
-definition is located in a different section, the behavior is undefined.
+External declarations may have an explicit section specified. Section
+information is retained in LLVM IR for targets that make use of this
+information. Attaching section information to an external declaration is an
+assertion that its definition is located in the specified section. If the
+definition is located in a different section, the behavior is undefined.
By default, global initializers are optimized by assuming that global
variables defined within the module are not modified from their
@@ -2272,11 +2272,11 @@ seq\_cst total orderings of other operations that are not marked
Fast-Math Flags
---------------
-LLVM IR floating-point binary ops (:ref:`fadd <i_fadd>`,
+LLVM IR floating-point operations (:ref:`fadd <i_fadd>`,
:ref:`fsub <i_fsub>`, :ref:`fmul <i_fmul>`, :ref:`fdiv <i_fdiv>`,
:ref:`frem <i_frem>`, :ref:`fcmp <i_fcmp>`) and :ref:`call <i_call>`
-instructions have the following flags that can be set to enable
-otherwise unsafe floating point transformations.
+may use the following flags to enable otherwise unsafe
+floating-point transformations.
``nnan``
No NaNs - Allow optimizations to assume the arguments and result are not
@@ -2300,10 +2300,17 @@ otherwise unsafe floating point transformations.
Allow floating-point contraction (e.g. fusing a multiply followed by an
addition into a fused multiply-and-add).
+``afn``
+   Approximate functions - Allow substitution of approximate calculations for
+   functions (sin, log, sqrt, etc.). See floating-point intrinsic definitions
+   for places where this can apply to LLVM's intrinsic math functions.
+
+``reassoc``
+   Allow reassociation transformations for floating-point instructions.
+   This may dramatically change floating-point results.
+
``fast``
- Fast - Allow algebraically equivalent transformations that may
- dramatically change results in floating point (e.g. reassociate). This
- flag implies all the others.
+ This flag implies all of the others.
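+
+As a brief, hypothetical example, fast-math flags are written on the
+instruction itself, either individually or all at once via ``fast``:
+
+.. code-block:: text
+
+    %x = fadd nnan ninf float %a, %b
+    %y = fmul fast float %c, %d
+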
.. _uselistorder:
@@ -4499,7 +4506,7 @@ source variable. DIExpressions also follow this model: A DIExpression that
doesn't have a trailing ``DW_OP_stack_value`` will describe an *address* when
combined with a concrete location.
-.. code-block:: llvm
+.. code-block:: text
!0 = !DIExpression(DW_OP_deref)
!1 = !DIExpression(DW_OP_plus_uconst, 3)
@@ -4639,13 +4646,13 @@ As a concrete example, the type descriptor graph for the following program
int i; // offset 0
float f; // offset 4
};
-
+
struct Outer {
float f; // offset 0
double d; // offset 4
struct Inner inner_a; // offset 12
};
-
+
void f(struct Outer* outer, struct Inner* inner, float* f, int* i, char* c) {
outer->f = 0; // tag0: (OuterStructTy, FloatScalarTy, 0)
outer->inner_a.i = 0; // tag1: (OuterStructTy, IntScalarTy, 12)
@@ -5194,14 +5201,37 @@ the loop identifier metadata node directly:
!1 = !{!1} ; an identifier for the inner loop
!2 = !{!2} ; an identifier for the outer loop
+'``irr_loop``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^
+
+``irr_loop`` metadata may be attached to the terminator instruction of a basic
+block that is an irreducible loop header (note that an irreducible loop has
+more than one header basic block). If ``irr_loop`` metadata is attached to the
+terminator instruction of a basic block that is not really an irreducible loop
+header, the behavior is undefined. The intent of this metadata is to improve
+the accuracy of the block frequency propagation. For example, in the code
+below, the block ``header0`` may have a loop header weight (relative to the
+other headers of the irreducible loop) of 100:
+
+.. code-block:: llvm
+
+ header0:
+ ...
+ br i1 %cmp, label %t1, label %t2, !irr_loop !0
+
+ ...
+ !0 = !{"loop_header_weight", i64 100}
+
+Irreducible loop header weights are typically based on profile data.
+
'``invariant.group``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The ``invariant.group`` metadata may be attached to ``load``/``store`` instructions.
-The existence of the ``invariant.group`` metadata on the instruction tells
-the optimizer that every ``load`` and ``store`` to the same pointer operand
-within the same invariant group can be assumed to load or store the same
-value (but see the ``llvm.invariant.group.barrier`` intrinsic which affects
+The existence of the ``invariant.group`` metadata on the instruction tells
+the optimizer that every ``load`` and ``store`` to the same pointer operand
+within the same invariant group can be assumed to load or store the same
+value (but see the ``llvm.invariant.group.barrier`` intrinsic which affects
when two pointers are considered the same). Pointers returned by bitcast or
getelementptr with only zero indices are considered the same.
@@ -5214,26 +5244,26 @@ Examples:
%ptr = alloca i8
store i8 42, i8* %ptr, !invariant.group !0
call void @foo(i8* %ptr)
-
+
%a = load i8, i8* %ptr, !invariant.group !0 ; Can assume that value under %ptr didn't change
call void @foo(i8* %ptr)
%b = load i8, i8* %ptr, !invariant.group !1 ; Can't assume anything, because group changed
-
- %newPtr = call i8* @getPointer(i8* %ptr)
+
+ %newPtr = call i8* @getPointer(i8* %ptr)
%c = load i8, i8* %newPtr, !invariant.group !0 ; Can't assume anything, because we only have information about %ptr
-
+
%unknownValue = load i8, i8* @unknownPtr
store i8 %unknownValue, i8* %ptr, !invariant.group !0 ; Can assume that %unknownValue == 42
-
+
call void @foo(i8* %ptr)
%newPtr2 = call i8* @llvm.invariant.group.barrier(i8* %ptr)
%d = load i8, i8* %newPtr2, !invariant.group !0 ; Can't step through invariant.group.barrier to get value of %ptr
-
+
...
declare void @foo(i8*)
declare i8* @getPointer(i8*)
declare i8* @llvm.invariant.group.barrier(i8*)
-
+
!0 = !{!"magic ptr"}
!1 = !{!"other ptr"}
@@ -5242,7 +5272,7 @@ another based on aliasing information. This is because invariant.group is tied
to the SSA value of the pointer operand.
.. code-block:: llvm
-
+
%v = load i8, i8* %x, !invariant.group !0
; if %x mustalias %y then we can replace the above instruction with
%v = load i8, i8* %y
@@ -5272,7 +5302,7 @@ It does not have any effect on non-ELF targets.
Example:
-.. code-block:: llvm
+.. code-block:: text
$a = comdat any
@a = global i32 1, comdat $a
@@ -6700,9 +6730,9 @@ remainder.
Note that unsigned integer remainder and signed integer remainder are
distinct operations; for signed integer remainder, use '``srem``'.
-
+
Taking the remainder of a division by zero is undefined behavior.
-For vectors, if any element of the divisor is zero, the operation has
+For vectors, if any element of the divisor is zero, the operation has
undefined behavior.
Example:
@@ -6754,7 +6784,7 @@ Note that signed integer remainder and unsigned integer remainder are
distinct operations; for unsigned integer remainder, use '``urem``'.
Taking the remainder of a division by zero is undefined behavior.
-For vectors, if any element of the divisor is zero, the operation has
+For vectors, if any element of the divisor is zero, the operation has
undefined behavior.
Overflow also leads to undefined behavior; this is a rare case, but can
occur, for example, by taking the remainder of a 32-bit division of
@@ -7627,7 +7657,7 @@ be reused in the cache. The code generator may select special
instructions to save cache bandwidth, such as the ``MOVNT`` instruction on
x86.
-The optional ``!invariant.group`` metadata must reference a
+The optional ``!invariant.group`` metadata must reference a
single metadata name ``<index>``. See ``invariant.group`` metadata.
Semantics:
@@ -7701,7 +7731,7 @@ A ``fence`` instruction can also take an optional
Example:
""""""""
-.. code-block:: llvm
+.. code-block:: text
fence acquire ; yields void
fence syncscope("singlethread") seq_cst ; yields void
@@ -7733,10 +7763,10 @@ There are three arguments to the '``cmpxchg``' instruction: an address
to operate on, a value to compare to the value currently be at that
address, and a new value to place at that address if the compared values
are equal. The type of '<cmp>' must be an integer or pointer type whose
-bit width is a power of two greater than or equal to eight and less
+bit width is a power of two greater than or equal to eight and less
than or equal to a target-specific size limit. '<cmp>' and '<new>' must
-have the same type, and the type of '<pointer>' must be a pointer to
-that type. If the ``cmpxchg`` is marked as ``volatile``, then the
+have the same type, and the type of '<pointer>' must be a pointer to
+that type. If the ``cmpxchg`` is marked as ``volatile``, then the
optimizer is not allowed to modify the number or order of execution of
this ``cmpxchg`` with other :ref:`volatile operations <volatile>`.
@@ -9030,7 +9060,7 @@ This instruction requires several arguments:
``tail`` or ``musttail`` markers to the call. It is used to prevent tail
call optimization from being performed on the call.
-#. The optional ``fast-math flags`` marker indicates that the call has one or more
+#. The optional ``fast-math flags`` marker indicates that the call has one or more
:ref:`fast-math flags <fastmath>`, which are optimization hints to enable
otherwise unsafe floating-point optimizations. Fast-math flags are only valid
for calls that return a floating-point scalar or vector type.
@@ -10460,7 +10490,7 @@ Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.sqrt`` on any
-floating point or vector of floating point type. Not all targets support
+floating-point or vector of floating-point type. Not all targets support
all types however.
::
@@ -10474,20 +10504,22 @@ all types however.
Overview:
"""""""""
-The '``llvm.sqrt``' intrinsics return the square root of the specified value,
-returning the same value as the libm '``sqrt``' functions would, but without
-trapping or setting ``errno``.
+The '``llvm.sqrt``' intrinsics return the square root of the specified value.
Arguments:
""""""""""
-The argument and return value are floating point numbers of the same type.
+The argument and return value are floating-point numbers of the same type.
Semantics:
""""""""""
-This function returns the square root of the operand if it is a nonnegative
-floating point number.
+Return the same value as a corresponding libm '``sqrt``' function but without
+trapping or setting ``errno``. For types specified by IEEE-754, the result
+matches a conforming libm implementation.
+
+When specified with the fast-math-flag 'afn', the result may be approximated
+using a less accurate calculation.
'``llvm.powi.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -10534,7 +10566,7 @@ Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.sin`` on any
-floating point or vector of floating point type. Not all targets support
+floating-point or vector of floating-point type. Not all targets support
all types however.
::
@@ -10553,14 +10585,16 @@ The '``llvm.sin.*``' intrinsics return the sine of the operand.
Arguments:
""""""""""
-The argument and return value are floating point numbers of the same type.
+The argument and return value are floating-point numbers of the same type.
Semantics:
""""""""""
-This function returns the sine of the specified operand, returning the
-same values as the libm ``sin`` functions would, and handles error
-conditions in the same way.
+Return the same value as a corresponding libm '``sin``' function but without
+trapping or setting ``errno``.
+
+When specified with the fast-math-flag 'afn', the result may be approximated
+using a less accurate calculation.
'``llvm.cos.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -10569,7 +10603,7 @@ Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.cos`` on any
-floating point or vector of floating point type. Not all targets support
+floating-point or vector of floating-point type. Not all targets support
all types however.
::
@@ -10588,14 +10622,16 @@ The '``llvm.cos.*``' intrinsics return the cosine of the operand.
Arguments:
""""""""""
-The argument and return value are floating point numbers of the same type.
+The argument and return value are floating-point numbers of the same type.
Semantics:
""""""""""
-This function returns the cosine of the specified operand, returning the
-same values as the libm ``cos`` functions would, and handles error
-conditions in the same way.
+Return the same value as a corresponding libm '``cos``' function but without
+trapping or setting ``errno``.
+
+When specified with the fast-math-flag 'afn', the result may be approximated
+using a less accurate calculation.
'``llvm.pow.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -10604,7 +10640,7 @@ Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.pow`` on any
-floating point or vector of floating point type. Not all targets support
+floating-point or vector of floating-point type. Not all targets support
all types however.
::
@@ -10624,15 +10660,16 @@ specified (positive or negative) power.
Arguments:
""""""""""
-The second argument is a floating point power, and the first is a value
-to raise to that power.
+The arguments and return value are floating-point numbers of the same type.
Semantics:
""""""""""
-This function returns the first value raised to the second power,
-returning the same values as the libm ``pow`` functions would, and
-handles error conditions in the same way.
+Return the same value as a corresponding libm '``pow``' function but without
+trapping or setting ``errno``.
+
+When specified with the fast-math-flag 'afn', the result may be approximated
+using a less accurate calculation.
'``llvm.exp.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -10641,7 +10678,7 @@ Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.exp`` on any
-floating point or vector of floating point type. Not all targets support
+floating-point or vector of floating-point type. Not all targets support
all types however.
::
@@ -10661,13 +10698,16 @@ value.
Arguments:
""""""""""
-The argument and return value are floating point numbers of the same type.
+The argument and return value are floating-point numbers of the same type.
Semantics:
""""""""""
-This function returns the same values as the libm ``exp`` functions
-would, and handles error conditions in the same way.
+Return the same value as a corresponding libm '``exp``' function but without
+trapping or setting ``errno``.
+
+When specified with the fast-math-flag 'afn', the result may be approximated
+using a less accurate calculation.
'``llvm.exp2.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -10676,7 +10716,7 @@ Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.exp2`` on any
-floating point or vector of floating point type. Not all targets support
+floating-point or vector of floating-point type. Not all targets support
all types however.
::
@@ -10696,13 +10736,16 @@ specified value.
Arguments:
""""""""""
-The argument and return value are floating point numbers of the same type.
+The argument and return value are floating-point numbers of the same type.
Semantics:
""""""""""
-This function returns the same values as the libm ``exp2`` functions
-would, and handles error conditions in the same way.
+Return the same value as a corresponding libm '``exp2``' function but without
+trapping or setting ``errno``.
+
+When specified with the fast-math-flag 'afn', the result may be approximated
+using a less accurate calculation.
'``llvm.log.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -10711,7 +10754,7 @@ Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.log`` on any
-floating point or vector of floating point type. Not all targets support
+floating-point or vector of floating-point type. Not all targets support
all types however.
::
@@ -10731,13 +10774,16 @@ value.
Arguments:
""""""""""
-The argument and return value are floating point numbers of the same type.
+The argument and return value are floating-point numbers of the same type.
Semantics:
""""""""""
-This function returns the same values as the libm ``log`` functions
-would, and handles error conditions in the same way.
+Return the same value as a corresponding libm '``log``' function but without
+trapping or setting ``errno``.
+
+When specified with the fast-math-flag 'afn', the result may be approximated
+using a less accurate calculation.
'``llvm.log10.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -10746,7 +10792,7 @@ Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.log10`` on any
-floating point or vector of floating point type. Not all targets support
+floating-point or vector of floating-point type. Not all targets support
all types however.
::
@@ -10766,13 +10812,16 @@ specified value.
Arguments:
""""""""""
-The argument and return value are floating point numbers of the same type.
+The argument and return value are floating-point numbers of the same type.
Semantics:
""""""""""
-This function returns the same values as the libm ``log10`` functions
-would, and handles error conditions in the same way.
+Return the same value as a corresponding libm '``log10``' function but without
+trapping or setting ``errno``.
+
+When specified with the fast-math-flag 'afn', the result may be approximated
+using a less accurate calculation.
'``llvm.log2.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -10781,7 +10830,7 @@ Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.log2`` on any
-floating point or vector of floating point type. Not all targets support
+floating-point or vector of floating-point type. Not all targets support
all types however.
::
@@ -10801,13 +10850,16 @@ value.
Arguments:
""""""""""
-The argument and return value are floating point numbers of the same type.
+The argument and return value are floating-point numbers of the same type.
Semantics:
""""""""""
-This function returns the same values as the libm ``log2`` functions
-would, and handles error conditions in the same way.
+Return the same value as a corresponding libm '``log2``' function but without
+trapping or setting ``errno``.
+
+When specified with the fast-math-flag 'afn', the result may be approximated
+using a less accurate calculation.
'``llvm.fma.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -10816,7 +10868,7 @@ Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.fma`` on any
-floating point or vector of floating point type. Not all targets support
+floating-point or vector of floating-point type. Not all targets support
all types however.
::
@@ -10830,20 +10882,21 @@ all types however.
Overview:
"""""""""
-The '``llvm.fma.*``' intrinsics perform the fused multiply-add
-operation.
+The '``llvm.fma.*``' intrinsics perform the fused multiply-add operation.
Arguments:
""""""""""
-The argument and return value are floating point numbers of the same
-type.
+The arguments and return value are floating-point numbers of the same type.
Semantics:
""""""""""
-This function returns the same values as the libm ``fma`` functions
-would, and does not set errno.
+Return the same value as a corresponding libm '``fma``' function but without
+trapping or setting ``errno``.
+
+When specified with the fast-math-flag 'afn', the result may be approximated
+using a less accurate calculation.
'``llvm.fabs.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -12772,7 +12825,7 @@ Syntax:
Overview:
"""""""""
-The '``llvm.invariant.group.barrier``' intrinsic can be used when an invariant
+The '``llvm.invariant.group.barrier``' intrinsic can be used when an invariant
established by invariant.group metadata no longer holds, to obtain a new pointer
value that does not carry the invariant information.
@@ -12786,7 +12839,7 @@ the pointer to the memory for which the ``invariant.group`` no longer holds.
Semantics:
""""""""""
-Returns another pointer that aliases its argument but which is considered different
+Returns another pointer that aliases its argument but which is considered different
for the purposes of ``load``/``store`` ``invariant.group`` metadata.
Constrained Floating Point Intrinsics
@@ -12864,7 +12917,7 @@ strictly preserve the floating point exception semantics of the original code.
Any FP exception that would have been raised by the original code must be raised
by the transformed code, and the transformed code must not raise any FP
exceptions that would not have been raised by the original code. This is the
-exception behavior argument that will be used if the code being compiled reads
+exception behavior argument that will be used if the code being compiled reads
the FP exception status flags, but this mode can also be used with code that
unmasks FP exceptions.
@@ -12882,7 +12935,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.fadd(<type> <op1>, <type> <op2>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -12919,7 +12972,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.fsub(<type> <op1>, <type> <op2>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -12956,7 +13009,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.fmul(<type> <op1>, <type> <op2>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -12993,7 +13046,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.fdiv(<type> <op1>, <type> <op2>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -13030,7 +13083,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.frem(<type> <op1>, <type> <op2>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -13059,7 +13112,7 @@ Semantics:
The value produced is the floating point remainder from the division of the two
value operands and has the same type as the operands. The remainder has the
-same sign as the dividend.
+same sign as the dividend.
'``llvm.experimental.constrained.fma``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -13119,7 +13172,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.sqrt(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -13156,7 +13209,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.pow(<type> <op1>, <type> <op2>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -13193,7 +13246,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.powi(<type> <op1>, i32 <op2>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -13232,7 +13285,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.sin(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -13268,7 +13321,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.cos(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -13304,7 +13357,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.exp(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -13339,7 +13392,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.exp2(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -13375,7 +13428,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.log(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -13411,7 +13464,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.log10(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -13446,7 +13499,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.log2(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -13481,7 +13534,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.rint(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -13520,7 +13573,7 @@ Syntax:
::
- declare <type>
+ declare <type>
@llvm.experimental.constrained.nearbyint(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
@@ -14281,7 +14334,7 @@ The '``llvm.memcpy.element.unordered.atomic.*``' intrinsic copies ``len`` bytes
memory from the source location to the destination location. These locations are not
allowed to overlap. The memory copy is performed as a sequence of load/store operations
where each access is guaranteed to be a multiple of ``element_size`` bytes wide and
-aligned at an ``element_size`` boundary.
+aligned at an ``element_size`` boundary.
The order of the copy is unspecified. The same value may be read from the source
buffer many times, but only one write is issued to the destination buffer per
@@ -14356,7 +14409,7 @@ The '``llvm.memmove.element.unordered.atomic.*``' intrinsic copies ``len`` bytes
of memory from the source location to the destination location. These locations
are allowed to overlap. The memory copy is performed as a sequence of load/store
operations where each access is guaranteed to be a multiple of ``element_size``
-bytes wide and aligned at an ``element_size`` boundary.
+bytes wide and aligned at an ``element_size`` boundary.
The order of the copy is unspecified. The same value may be read from the source
buffer many times, but only one write is issued to the destination buffer per
@@ -14431,7 +14484,7 @@ Semantics:
The '``llvm.memset.element.unordered.atomic.*``' intrinsic sets the ``len`` bytes of
memory starting at the destination location to the given ``value``. The memory is
set with a sequence of store operations where each access is guaranteed to be a
-multiple of ``element_size`` bytes wide and aligned at an ``element_size`` boundary.
+multiple of ``element_size`` bytes wide and aligned at an ``element_size`` boundary.
The order of the assignment is unspecified. Only one write is issued to the
destination buffer per element. It is well defined to have concurrent reads and
diff --git a/docs/SourceLevelDebugging.rst b/docs/SourceLevelDebugging.rst
index 491171393ab..103c6e0365b 100644
--- a/docs/SourceLevelDebugging.rst
+++ b/docs/SourceLevelDebugging.rst
@@ -188,7 +188,7 @@ the variable. The third argument is a `complex expression
<LangRef.html#diexpression>`_. An `llvm.dbg.addr` intrinsic describes the
*address* of a source variable.
-.. code-block:: llvm
+.. code-block:: text
%i.addr = alloca i32, align 4
call void @llvm.dbg.addr(metadata i32* %i.addr, metadata !1,
diff --git a/docs/WritingAnLLVMPass.rst b/docs/WritingAnLLVMPass.rst
index 54b3630e655..41f400740e8 100644
--- a/docs/WritingAnLLVMPass.rst
+++ b/docs/WritingAnLLVMPass.rst
@@ -1032,7 +1032,7 @@ implementation for the interface.
Pass Statistics
===============
-The `Statistic <http://llvm.org/doxygen/Statistic_8h-source.html>`_ class is
+The `Statistic <http://llvm.org/doxygen/Statistic_8h_source.html>`_ class is
designed to be an easy way to expose various success metrics from passes.
These statistics are printed at the end of a run, when the :option:`-stats`
command line option is enabled on the command line. See the :ref:`Statistics
@@ -1043,7 +1043,7 @@ section <Statistic>` in the Programmer's Manual for details.
What PassManager does
---------------------
-The `PassManager <http://llvm.org/doxygen/PassManager_8h-source.html>`_ `class
+The `PassManager <http://llvm.org/doxygen/PassManager_8h_source.html>`_ `class
<http://llvm.org/doxygen/classllvm_1_1PassManager.html>`_ takes a list of
passes, ensures their :ref:`prerequisites <writing-an-llvm-pass-interaction>`
are set up correctly, and then schedules passes to run efficiently. All of the
diff --git a/docs/index.rst b/docs/index.rst
index 955607a751c..47c2f047393 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -68,6 +68,7 @@ representation.
CMakePrimer
AdvancedBuilds
HowToBuildOnARM
+ HowToCrossCompileBuiltinsOnArm
HowToCrossCompileLLVM
CommandGuide/index
GettingStarted
@@ -105,6 +106,9 @@ representation.
:doc:`HowToBuildOnARM`
Notes on building and testing LLVM/Clang on ARM.
+:doc:`HowToCrossCompileBuiltinsOnArm`
+ Notes on cross-building and testing the compiler-rt builtins for Arm.
+
:doc:`HowToCrossCompileLLVM`
Notes on cross-building and testing LLVM/Clang.
diff --git a/include/llvm-c/DebugInfo.h b/include/llvm-c/DebugInfo.h
index 15f6b57d883..d17c690be4d 100644
--- a/include/llvm-c/DebugInfo.h
+++ b/include/llvm-c/DebugInfo.h
@@ -14,13 +14,18 @@
///
//===----------------------------------------------------------------------===//
+#ifndef LLVM_C_DEBUGINFO_H
+#define LLVM_C_DEBUGINFO_H
+
#include "llvm-c/Core.h"
#ifdef __cplusplus
extern "C" {
#endif
-/// Debug info flags.
+/**
+ * Debug info flags.
+ */
typedef enum {
LLVMDIFlagZero = 0,
LLVMDIFlagPrivate = 1,
@@ -55,7 +60,9 @@ typedef enum {
LLVMDIFlagVirtualInheritance
} LLVMDIFlags;
-/// Source languages known by DWARF.
+/**
+ * Source languages known by DWARF.
+ */
typedef enum {
LLVMDWARFSourceLanguageC89,
LLVMDWARFSourceLanguageC,
@@ -103,68 +110,85 @@ typedef enum {
LLVMDWARFSourceLanguageBORLAND_Delphi
} LLVMDWARFSourceLanguage;
-/// The amount of debug information to emit.
+/**
+ * The amount of debug information to emit.
+ */
typedef enum {
LLVMDWARFEmissionNone = 0,
LLVMDWARFEmissionFull,
LLVMDWARFEmissionLineTablesOnly
} LLVMDWARFEmissionKind;
-/// The current debug metadata version number.
+/**
+ * The current debug metadata version number.
+ */
unsigned LLVMDebugMetadataVersion(void);
-/// The version of debug metadata that's present in the provided \c Module.
+/**
+ * The version of debug metadata that's present in the provided \c Module.
+ */
unsigned LLVMGetModuleDebugMetadataVersion(LLVMModuleRef Module);
-/// Strip debug info in the module if it exists.
-///
-/// To do this, we remove all calls to the debugger intrinsics and any named
-/// metadata for debugging. We also remove debug locations for instructions.
-/// Return true if module is modified.
+/**
+ * Strip debug info in the module if it exists.
+ * To do this, we remove all calls to the debugger intrinsics and any named
+ * metadata for debugging. We also remove debug locations for instructions.
+ * Return true if module is modified.
+ */
LLVMBool LLVMStripModuleDebugInfo(LLVMModuleRef Module);
-/// Construct a builder for a module, and do not allow for unresolved nodes
-/// attached to the module.
+/**
+ * Construct a builder for a module, and do not allow for unresolved nodes
+ * attached to the module.
+ */
LLVMDIBuilderRef LLVMCreateDIBuilderDisallowUnresolved(LLVMModuleRef M);
-/// Construct a builder for a module and collect unresolved nodes attached
-/// to the module in order to resolve cycles during a call to
-/// \c LLVMDIBuilderFinalize.
+/**
+ * Construct a builder for a module and collect unresolved nodes attached
+ * to the module in order to resolve cycles during a call to
+ * \c LLVMDIBuilderFinalize.
+ */
LLVMDIBuilderRef LLVMCreateDIBuilder(LLVMModuleRef M);
-/// Deallocates the DIBuilder and everything it owns.
-/// @note You must call \c LLVMDIBuilderFinalize before this
+/**
+ * Deallocates the \c DIBuilder and everything it owns.
+ * @note You must call \c LLVMDIBuilderFinalize before this
+ */
void LLVMDisposeDIBuilder(LLVMDIBuilderRef Builder);
-/// Construct any deferred debug info descriptors.
+/**
+ * Construct any deferred debug info descriptors.
+ */
void LLVMDIBuilderFinalize(LLVMDIBuilderRef Builder);
-/// A CompileUnit provides an anchor for all debugging
-/// information generated during this instance of compilation.
-/// \param Lang Source programming language, eg.
-/// \c LLVMDWARFSourceLanguageC99
-/// \param FileRef File info.
-/// \param Producer Identify the producer of debugging information
-/// and code. Usually this is a compiler
-/// version string.
-/// \param ProducerLen The length of the C string passed to \c Producer.
-/// \param isOptimized A boolean flag which indicates whether optimization
-/// is enabled or not.
-/// \param Flags This string lists command line options. This
-/// string is directly embedded in debug info
-/// output which may be used by a tool
-/// analyzing generated debugging information.
-/// \param FlagsLen The length of the C string passed to \c Flags.
-/// \param RuntimeVer This indicates runtime version for languages like
-/// Objective-C.
-/// \param SplitName The name of the file that we'll split debug info
-/// out into.
-/// \param SplitNameLen The length of the C string passed to \c SplitName.
-/// \param Kind The kind of debug information to generate.
-/// \param DWOId The DWOId if this is a split skeleton compile unit.
-/// \param SplitDebugInlining Whether to emit inline debug info.
-/// \param DebugInfoForProfiling Whether to emit extra debug info for
-/// profile collection.
+/**
+ * A CompileUnit provides an anchor for all debugging
+ * information generated during this instance of compilation.
+ * \param Lang Source programming language, eg.
+ * \c LLVMDWARFSourceLanguageC99
+ * \param FileRef File info.
+ * \param Producer Identify the producer of debugging information
+ * and code. Usually this is a compiler
+ * version string.
+ * \param ProducerLen The length of the C string passed to \c Producer.
+ * \param isOptimized A boolean flag which indicates whether optimization
+ * is enabled or not.
+ * \param Flags This string lists command line options. This
+ * string is directly embedded in debug info
+ * output which may be used by a tool
+ * analyzing generated debugging information.
+ * \param FlagsLen The length of the C string passed to \c Flags.
+ * \param RuntimeVer This indicates runtime version for languages like
+ * Objective-C.
+ * \param SplitName The name of the file that we'll split debug info
+ * out into.
+ * \param SplitNameLen The length of the C string passed to \c SplitName.
+ * \param Kind The kind of debug information to generate.
+ * \param DWOId The DWOId if this is a split skeleton compile unit.
+ * \param SplitDebugInlining Whether to emit inline debug info.
+ * \param DebugInfoForProfiling Whether to emit extra debug info for
+ * profile collection.
+ */
LLVMMetadataRef LLVMDIBuilderCreateCompileUnit(
LLVMDIBuilderRef Builder, LLVMDWARFSourceLanguage Lang,
LLVMMetadataRef FileRef, const char *Producer, size_t ProducerLen,
@@ -173,30 +197,36 @@ LLVMMetadataRef LLVMDIBuilderCreateCompileUnit(
LLVMDWARFEmissionKind Kind, unsigned DWOId, LLVMBool SplitDebugInlining,
LLVMBool DebugInfoForProfiling);
-/// Create a file descriptor to hold debugging information for a file.
-/// \param Builder The DIBuilder.
-/// \param Filename File name.
-/// \param FilenameLen The length of the C string passed to \c Filename.
-/// \param Directory Directory.
-/// \param DirectoryLen The length of the C string passed to \c Directory.
+/**
+ * Create a file descriptor to hold debugging information for a file.
+ * \param Builder The \c DIBuilder.
+ * \param Filename File name.
+ * \param FilenameLen The length of the C string passed to \c Filename.
+ * \param Directory Directory.
+ * \param DirectoryLen The length of the C string passed to \c Directory.
+ */
LLVMMetadataRef
LLVMDIBuilderCreateFile(LLVMDIBuilderRef Builder, const char *Filename,
size_t FilenameLen, const char *Directory,
size_t DirectoryLen);
-/// Creates a new DebugLocation that describes a source location.
-/// \param Line The line in the source file.
-/// \param Column The column in the source file.
-/// \param Scope The scope in which the location resides.
-/// \param InlinedAt The scope where this location was inlined, if at all.
-/// (optional).
-/// \note If the item to which this location is attached cannot be
-/// attributed to a source line, pass 0 for the line and column.
+/**
+ * Creates a new DebugLocation that describes a source location.
+ * \param Line The line in the source file.
+ * \param Column The column in the source file.
+ * \param Scope The scope in which the location resides.
+ * \param InlinedAt The scope where this location was inlined, if at all.
+ * (optional).
+ * \note If the item to which this location is attached cannot be
+ * attributed to a source line, pass 0 for the line and column.
+ */
LLVMMetadataRef
LLVMDIBuilderCreateDebugLocation(LLVMContextRef Ctx, unsigned Line,
unsigned Column, LLVMMetadataRef Scope,
LLVMMetadataRef InlinedAt);
#ifdef __cplusplus
-} // end extern "C"
+} /* end extern "C" */
+#endif
+
#endif
diff --git a/include/llvm/ADT/MapVector.h b/include/llvm/ADT/MapVector.h
index 26a555ee1d3..3d78f4b203c 100644
--- a/include/llvm/ADT/MapVector.h
+++ b/include/llvm/ADT/MapVector.h
@@ -56,6 +56,13 @@ public:
size_type size() const { return Vector.size(); }
+ /// Grow the MapVector so that it can contain at least \p NumEntries items
+ /// before resizing again.
+ void reserve(size_type NumEntries) {
+ Map.reserve(NumEntries);
+ Vector.reserve(NumEntries);
+ }
+
iterator begin() { return Vector.begin(); }
const_iterator begin() const { return Vector.begin(); }
iterator end() { return Vector.end(); }
diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index 3ec9dfe5de0..1be5bf91385 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -813,6 +813,13 @@ void DeleteContainerSeconds(Container &C) {
C.clear();
}
+/// Provide wrappers to std::for_each which take ranges instead of having to
+/// pass begin/end explicitly.
+template <typename R, typename UnaryPredicate>
+UnaryPredicate for_each(R &&Range, UnaryPredicate P) {
+ return std::for_each(std::begin(Range), std::end(Range), P);
+}
+
/// Provide wrappers to std::all_of which take ranges instead of having to pass
/// begin/end explicitly.
template <typename R, typename UnaryPredicate>
diff --git a/include/llvm/Analysis/BlockFrequencyInfo.h b/include/llvm/Analysis/BlockFrequencyInfo.h
index d663b09d5cf..89370cbeeea 100644
--- a/include/llvm/Analysis/BlockFrequencyInfo.h
+++ b/include/llvm/Analysis/BlockFrequencyInfo.h
@@ -75,6 +75,10 @@ public:
/// the enclosing function's count (if available) and returns the value.
Optional<uint64_t> getProfileCountFromFreq(uint64_t Freq) const;
+ /// \brief Returns true if \p BB is an irreducible loop header
+ /// block. Otherwise false.
+ bool isIrrLoopHeader(const BasicBlock *BB);
+
// Set the frequency of the given basic block.
void setBlockFreq(const BasicBlock *BB, uint64_t Freq);
diff --git a/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/include/llvm/Analysis/BlockFrequencyInfoImpl.h
index 7f166f4a646..7b916e3653b 100644
--- a/include/llvm/Analysis/BlockFrequencyInfoImpl.h
+++ b/include/llvm/Analysis/BlockFrequencyInfoImpl.h
@@ -20,6 +20,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/IR/BasicBlock.h"
@@ -414,6 +415,10 @@ public:
/// \brief Data about each block. This is used downstream.
std::vector<FrequencyData> Freqs;
+ /// \brief Whether each block is an irreducible loop header.
+ /// This is used downstream.
+ SparseBitVector<> IsIrrLoopHeader;
+
/// \brief Loop data: see initializeLoops().
std::vector<WorkingData> Working;
@@ -492,6 +497,8 @@ public:
/// the backedges going into each of the loop headers.
void adjustLoopHeaderMass(LoopData &Loop);
+ void distributeIrrLoopHeaderMass(Distribution &Dist);
+
/// \brief Package up a loop.
void packageLoop(LoopData &Loop);
@@ -520,6 +527,7 @@ public:
const BlockNode &Node) const;
Optional<uint64_t> getProfileCountFromFreq(const Function &F,
uint64_t Freq) const;
+ bool isIrrLoopHeader(const BlockNode &Node);
void setBlockFreq(const BlockNode &Node, uint64_t Freq);
@@ -973,6 +981,10 @@ public:
return BlockFrequencyInfoImplBase::getProfileCountFromFreq(F, Freq);
}
+ bool isIrrLoopHeader(const BlockT *BB) {
+ return BlockFrequencyInfoImplBase::isIrrLoopHeader(getNode(BB));
+ }
+
void setBlockFreq(const BlockT *BB, uint64_t Freq);
Scaled64 getFloatingBlockFreq(const BlockT *BB) const {
@@ -1140,17 +1152,39 @@ bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n");
if (Loop.isIrreducible()) {
- BlockMass Remaining = BlockMass::getFull();
+ DEBUG(dbgs() << "isIrreducible = true\n");
+ Distribution Dist;
+ unsigned NumHeadersWithWeight = 0;
for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
- auto &Mass = Working[Loop.Nodes[H].Index].getMass();
- Mass = Remaining * BranchProbability(1, Loop.NumHeaders - H);
- Remaining -= Mass;
+ auto &HeaderNode = Loop.Nodes[H];
+ const BlockT *Block = getBlock(HeaderNode);
+ IsIrrLoopHeader.set(Loop.Nodes[H].Index);
+ Optional<uint64_t> HeaderWeight = Block->getIrrLoopHeaderWeight();
+ if (!HeaderWeight)
+ continue;
+ DEBUG(dbgs() << getBlockName(HeaderNode)
+ << " has irr loop header weight " << HeaderWeight.getValue()
+ << "\n");
+ NumHeadersWithWeight++;
+ uint64_t HeaderWeightValue = HeaderWeight.getValue();
+ if (HeaderWeightValue)
+ Dist.addLocal(HeaderNode, HeaderWeightValue);
}
+ if (NumHeadersWithWeight != Loop.NumHeaders) {
+ // Not all headers have a weight metadata. Distribute weight evenly.
+ Dist = Distribution();
+ for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
+ auto &HeaderNode = Loop.Nodes[H];
+ Dist.addLocal(HeaderNode, 1);
+ }
+ }
+ distributeIrrLoopHeaderMass(Dist);
for (const BlockNode &M : Loop.Nodes)
if (!propagateMassToSuccessors(&Loop, M))
llvm_unreachable("unhandled irreducible control flow");
-
- adjustLoopHeaderMass(Loop);
+ if (NumHeadersWithWeight != Loop.NumHeaders)
+ // Not all headers have a weight metadata. Adjust header mass.
+ adjustLoopHeaderMass(Loop);
} else {
Working[Loop.getHeader().Index].getMass() = BlockMass::getFull();
if (!propagateMassToSuccessors(&Loop, Loop.getHeader()))
@@ -1285,6 +1319,9 @@ raw_ostream &BlockFrequencyInfoImpl<BT>::print(raw_ostream &OS) const {
BlockFrequencyInfoImplBase::getBlockProfileCount(
*F->getFunction(), getNode(&BB)))
OS << ", count = " << ProfileCount.getValue();
+ if (Optional<uint64_t> IrrLoopHeaderWeight =
+ BB.getIrrLoopHeaderWeight())
+ OS << ", irr_loop_header_weight = " << IrrLoopHeaderWeight.getValue();
OS << "\n";
}
diff --git a/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h b/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h
index ae9396d9c21..84b6ec9beea 100644
--- a/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h
+++ b/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h
@@ -22,11 +22,11 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <cassert>
diff --git a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index b229411c814..4055ab11291 100644
--- a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -26,6 +26,7 @@
#include <cassert>
#include <tuple>
#include <utility>
+#include <unordered_map>
namespace llvm {
@@ -120,27 +121,144 @@ public:
}
}
+ typedef std::pair<uint16_t, LegalizeAction> SizeAndAction;
+ typedef std::vector<SizeAndAction> SizeAndActionsVec;
+ using SizeChangeStrategy =
+ std::function<SizeAndActionsVec(const SizeAndActionsVec &v)>;
+
/// More friendly way to set an action for common types that have an LLT
/// representation.
+ /// The LegalizeAction must be one for which needsLegalizingToDifferentSize
+ /// returns false.
void setAction(const InstrAspect &Aspect, LegalizeAction Action) {
+ assert(!needsLegalizingToDifferentSize(Action));
TablesInitialized = false;
- unsigned Opcode = Aspect.Opcode - FirstOp;
- if (Actions[Opcode].size() <= Aspect.Idx)
- Actions[Opcode].resize(Aspect.Idx + 1);
- Actions[Aspect.Opcode - FirstOp][Aspect.Idx][Aspect.Type] = Action;
+ const unsigned OpcodeIdx = Aspect.Opcode - FirstOp;
+ if (SpecifiedActions[OpcodeIdx].size() <= Aspect.Idx)
+ SpecifiedActions[OpcodeIdx].resize(Aspect.Idx + 1);
+ SpecifiedActions[OpcodeIdx][Aspect.Idx][Aspect.Type] = Action;
}
- /// If an operation on a given vector type (say <M x iN>) isn't explicitly
- /// specified, we proceed in 2 stages. First we legalize the underlying scalar
- /// (so that there's at least one legal vector with that scalar), then we
- /// adjust the number of elements in the vector so that it is legal. The
- /// desired action in the first step is controlled by this function.
- void setScalarInVectorAction(unsigned Opcode, LLT ScalarTy,
- LegalizeAction Action) {
- assert(!ScalarTy.isVector());
- ScalarInVectorActions[std::make_pair(Opcode, ScalarTy)] = Action;
+ /// The setAction calls record the non-size-changing legalization actions
+ /// to take on specifically-sized types. The SizeChangeStrategy defines what
+ /// to do when the size of the type needs to be changed to reach a legally
+ /// sized type (i.e., one that was defined through a setAction call).
+ /// e.g.
+ /// setAction ({G_ADD, 0, LLT::scalar(32)}, Legal);
+ /// setLegalizeScalarToDifferentSizeStrategy(
+ /// G_ADD, 0, widenToLargerTypesAndNarrowToLargest);
+ /// will end up defining getAction({G_ADD, 0, T}) to return the following
+ /// actions for different scalar types T:
+ /// LLT::scalar(1)..LLT::scalar(31): {WidenScalar, 0, LLT::scalar(32)}
+ /// LLT::scalar(32): {Legal, 0, LLT::scalar(32)}
+ /// LLT::scalar(33)..: {NarrowScalar, 0, LLT::scalar(32)}
+ ///
+ /// If no SizeChangeStrategy gets defined through this function,
+ /// the default is unsupportedForDifferentSizes.
+ void setLegalizeScalarToDifferentSizeStrategy(const unsigned Opcode,
+ const unsigned TypeIdx,
+ SizeChangeStrategy S) {
+ const unsigned OpcodeIdx = Opcode - FirstOp;
+ if (ScalarSizeChangeStrategies[OpcodeIdx].size() <= TypeIdx)
+ ScalarSizeChangeStrategies[OpcodeIdx].resize(TypeIdx + 1);
+ ScalarSizeChangeStrategies[OpcodeIdx][TypeIdx] = S;
+ }
+
+ /// See also setLegalizeScalarToDifferentSizeStrategy.
+ /// This function allows setting the SizeChangeStrategy for vector elements.
+ void setLegalizeVectorElementToDifferentSizeStrategy(const unsigned Opcode,
+ const unsigned TypeIdx,
+ SizeChangeStrategy S) {
+ const unsigned OpcodeIdx = Opcode - FirstOp;
+ if (VectorElementSizeChangeStrategies[OpcodeIdx].size() <= TypeIdx)
+ VectorElementSizeChangeStrategies[OpcodeIdx].resize(TypeIdx + 1);
+ VectorElementSizeChangeStrategies[OpcodeIdx][TypeIdx] = S;
+ }
+
+ /// A SizeChangeStrategy for the common case where legalization for a
+ /// particular operation consists of only supporting a specific set of type
+ /// sizes. E.g.
+ /// setAction ({G_DIV, 0, LLT::scalar(32)}, Legal);
+ /// setAction ({G_DIV, 0, LLT::scalar(64)}, Legal);
+ /// setLegalizeScalarToDifferentSizeStrategy(
+ /// G_DIV, 0, unsupportedForDifferentSizes);
+ /// will result in getAction({G_DIV, 0, T}) to return Legal for s32 and s64,
+ /// and Unsupported for all other scalar types T.
+ static SizeAndActionsVec
+ unsupportedForDifferentSizes(const SizeAndActionsVec &v) {
+ return increaseToLargerTypesAndDecreaseToLargest(v, Unsupported,
+ Unsupported);
+ }
+
+ /// A SizeChangeStrategy for the common case where legalization for a
+ /// particular operation consists of widening the type to a larger legal type;
+ /// if no such type exists, the type should instead be narrowed to the
+ /// largest legal type.
+ static SizeAndActionsVec
+ widenToLargerTypesAndNarrowToLargest(const SizeAndActionsVec &v) {
+ assert(v.size() > 0 &&
+ "At least one size that can be legalized towards is needed"
+ " for this SizeChangeStrategy");
+ return increaseToLargerTypesAndDecreaseToLargest(v, WidenScalar,
+ NarrowScalar);
+ }
+
+ static SizeAndActionsVec
+ widenToLargerTypesUnsupportedOtherwise(const SizeAndActionsVec &v) {
+ return increaseToLargerTypesAndDecreaseToLargest(v, WidenScalar,
+ Unsupported);
+ }
+
+ static SizeAndActionsVec
+ narrowToSmallerAndUnsupportedIfTooSmall(const SizeAndActionsVec &v) {
+ return decreaseToSmallerTypesAndIncreaseToSmallest(v, NarrowScalar,
+ Unsupported);
+ }
+
+ static SizeAndActionsVec
+ narrowToSmallerAndWidenToSmallest(const SizeAndActionsVec &v) {
+ assert(v.size() > 0 &&
+ "At least one size that can be legalized towards is needed"
+ " for this SizeChangeStrategy");
+ return decreaseToSmallerTypesAndIncreaseToSmallest(v, NarrowScalar,
+ WidenScalar);
+ }
+
+ /// A SizeChangeStrategy for the common case where legalization for a
+ /// particular vector operation consists of increasing the number of vector
+ /// elements until a legal type is reached; if no such type exists, the
+ /// operation should instead be legalized towards the widest vector that is
+ /// still legal. E.g.
+ /// setAction({G_ADD, LLT::vector(8, 8)}, Legal);
+ /// setAction({G_ADD, LLT::vector(16, 8)}, Legal);
+ /// setAction({G_ADD, LLT::vector(2, 32)}, Legal);
+ /// setAction({G_ADD, LLT::vector(4, 32)}, Legal);
+ /// setLegalizeVectorElementToDifferentSizeStrategy(
+ /// G_ADD, 0, moreToWiderTypesAndLessToWidest);
+ /// will result in the following getAction results:
+ /// * getAction({G_ADD, LLT::vector(8,8)}) returns
+ /// (Legal, vector(8,8)).
+ /// * getAction({G_ADD, LLT::vector(9,8)}) returns
+ /// (MoreElements, vector(16,8)).
+ /// * getAction({G_ADD, LLT::vector(8,32)}) returns
+ /// (FewerElements, vector(4,32)).
+ static SizeAndActionsVec
+ moreToWiderTypesAndLessToWidest(const SizeAndActionsVec &v) {
+ return increaseToLargerTypesAndDecreaseToLargest(v, MoreElements,
+ FewerElements);
}
+ /// Helper function to implement many typical SizeChangeStrategy functions.
+ static SizeAndActionsVec
+ increaseToLargerTypesAndDecreaseToLargest(const SizeAndActionsVec &v,
+ LegalizeAction IncreaseAction,
+ LegalizeAction DecreaseAction);
+ /// Helper function to implement many typical SizeChangeStrategy functions.
+ static SizeAndActionsVec
+ decreaseToSmallerTypesAndIncreaseToSmallest(const SizeAndActionsVec &v,
+ LegalizeAction DecreaseAction,
+ LegalizeAction IncreaseAction);
+
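As a hedged illustration (target setup code, not part of this patch; the opcode and sizes are arbitrary), a derived LegalizerInfo constructor might now pair per-size actions with a size-change strategy as the comments above describe:

  // Inside a target's LegalizerInfo constructor (sketch).
  using namespace TargetOpcode;
  setAction({G_ADD, 0, LLT::scalar(32)}, Legal);
  setAction({G_ADD, 0, LLT::scalar(64)}, Legal);
  setLegalizeScalarToDifferentSizeStrategy(
      G_ADD, 0, widenToLargerTypesAndNarrowToLargest);
  computeTables();
  // With this setup, getAction({G_ADD, 0, LLT::scalar(16)}) is expected to
  // request widening towards s32, and scalars wider than 64 bits to be
  // narrowed towards s64.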
/// Determine what action should be taken to legalize the given generic
/// instruction opcode, type-index and type. Requires computeTables to have
/// been called.
@@ -158,58 +276,6 @@ public:
std::tuple<LegalizeAction, unsigned, LLT>
getAction(const MachineInstr &MI, const MachineRegisterInfo &MRI) const;
- /// Iterate the given function (typically something like doubling the width)
- /// on Ty until we find a legal type for this operation.
- Optional<LLT> findLegalizableSize(const InstrAspect &Aspect,
- function_ref<LLT(LLT)> NextType) const {
- if (Aspect.Idx >= Actions[Aspect.Opcode - FirstOp].size())
- return None;
-
- LegalizeAction Action;
- const TypeMap &Map = Actions[Aspect.Opcode - FirstOp][Aspect.Idx];
- LLT Ty = Aspect.Type;
- do {
- Ty = NextType(Ty);
- auto ActionIt = Map.find(Ty);
- if (ActionIt == Map.end()) {
- auto DefaultIt = DefaultActions.find(Aspect.Opcode);
- if (DefaultIt == DefaultActions.end())
- return None;
- Action = DefaultIt->second;
- } else
- Action = ActionIt->second;
- } while (needsLegalizingToDifferentSize(Action));
- return Ty;
- }
-
- /// Find what type it's actually OK to perform the given operation on, given
- /// the general approach we've decided to take.
- Optional<LLT> findLegalType(const InstrAspect &Aspect, LegalizeAction Action) const;
-
- std::pair<LegalizeAction, LLT> findLegalAction(const InstrAspect &Aspect,
- LegalizeAction Action) const {
- auto LegalType = findLegalType(Aspect, Action);
- if (!LegalType)
- return std::make_pair(LegalizeAction::Unsupported, LLT());
- return std::make_pair(Action, *LegalType);
- }
-
- /// Find the specified \p Aspect in the primary (explicitly set) Actions
- /// table. Returns either the action the target requested or NotFound if there
- /// was no setAction call.
- LegalizeAction findInActions(const InstrAspect &Aspect) const {
- if (Aspect.Opcode < FirstOp || Aspect.Opcode > LastOp)
- return NotFound;
- if (Aspect.Idx >= Actions[Aspect.Opcode - FirstOp].size())
- return NotFound;
- const TypeMap &Map = Actions[Aspect.Opcode - FirstOp][Aspect.Idx];
- auto ActionIt = Map.find(Aspect.Type);
- if (ActionIt == Map.end())
- return NotFound;
-
- return ActionIt->second;
- }
-
bool isLegal(const MachineInstr &MI, const MachineRegisterInfo &MRI) const;
virtual bool legalizeCustom(MachineInstr &MI,
@@ -217,20 +283,181 @@ public:
MachineIRBuilder &MIRBuilder) const;
private:
- static const int FirstOp = TargetOpcode::PRE_ISEL_GENERIC_OPCODE_START;
- static const int LastOp = TargetOpcode::PRE_ISEL_GENERIC_OPCODE_END;
+ /// The SizeAndActionsVec is a representation mapping between all natural
+ /// numbers and an Action. The natural number represents the bit size of
+ /// the InstrAspect. For example, for a target with native support for 32-bit
+ /// and 64-bit additions, you'd express that as:
+ /// setScalarAction(G_ADD, 0,
+ /// {{1, WidenScalar}, // bit sizes [ 1, 32[
+ /// {32, Legal}, // bit sizes [32, 33[
+ /// {33, WidenScalar}, // bit sizes [33, 64[
+ /// {64, Legal}, // bit sizes [64, 65[
+ /// {65, NarrowScalar} // bit sizes [65, +inf[
+ /// });
+ /// It may be that only 64-bit pointers are supported on your target:
+ /// setPointerAction(G_GEP, 0, LLT::pointer(1),
+ /// {{1, Unsupported}, // bit sizes [ 1, 64[
+ /// {64, Legal}, // bit sizes [64, 65[
+ /// {65, Unsupported}, // bit sizes [65, +inf[
+ /// });
+ void setScalarAction(const unsigned Opcode, const unsigned TypeIndex,
+ const SizeAndActionsVec &SizeAndActions) {
+ const unsigned OpcodeIdx = Opcode - FirstOp;
+ SmallVector<SizeAndActionsVec, 1> &Actions = ScalarActions[OpcodeIdx];
+ setActions(TypeIndex, Actions, SizeAndActions);
+ }
+ void setPointerAction(const unsigned Opcode, const unsigned TypeIndex,
+ const unsigned AddressSpace,
+ const SizeAndActionsVec &SizeAndActions) {
+ const unsigned OpcodeIdx = Opcode - FirstOp;
+ if (AddrSpace2PointerActions[OpcodeIdx].find(AddressSpace) ==
+ AddrSpace2PointerActions[OpcodeIdx].end())
+ AddrSpace2PointerActions[OpcodeIdx][AddressSpace] = {{}};
+ SmallVector<SizeAndActionsVec, 1> &Actions =
+ AddrSpace2PointerActions[OpcodeIdx].find(AddressSpace)->second;
+ setActions(TypeIndex, Actions, SizeAndActions);
+ }
+
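To make the interval semantics concrete, here is a purely illustrative free function (not part of the class; the class's own findAction, declared further down, serves this purpose internally) that resolves which entry governs a queried bit size: each entry applies from its size up to, but not including, the next entry's size.

  // Hypothetical helper, for exposition only.
  static LegalizerInfo::SizeAndAction
  entryFor(const LegalizerInfo::SizeAndActionsVec &Vec, uint32_t Size) {
    assert(!Vec.empty() && Vec.front().first <= Size && "size not covered");
    LegalizerInfo::SizeAndAction Result = Vec.front();
    for (const auto &SA : Vec)
      if (SA.first <= Size)
        Result = SA; // last entry whose lower bound does not exceed Size
    return Result;
  }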
+ /// If an operation on a given vector type (say <M x iN>) isn't explicitly
+ /// specified, we proceed in 2 stages. First we legalize the underlying scalar
+ /// (so that there's at least one legal vector with that scalar), then we
+ /// adjust the number of elements in the vector so that it is legal. The
+ /// desired action in the first step is controlled by this function.
+ void setScalarInVectorAction(const unsigned Opcode, const unsigned TypeIndex,
+ const SizeAndActionsVec &SizeAndActions) {
+ unsigned OpcodeIdx = Opcode - FirstOp;
+ SmallVector<SizeAndActionsVec, 1> &Actions =
+ ScalarInVectorActions[OpcodeIdx];
+ setActions(TypeIndex, Actions, SizeAndActions);
+ }
+
+ /// See also setScalarInVectorAction.
+ /// This function lets you specify the number of elements in a vector that
+ /// are legal for a legal element size.
+ void setVectorNumElementAction(const unsigned Opcode,
+ const unsigned TypeIndex,
+ const unsigned ElementSize,
+ const SizeAndActionsVec &SizeAndActions) {
+ const unsigned OpcodeIdx = Opcode - FirstOp;
+ if (NumElements2Actions[OpcodeIdx].find(ElementSize) ==
+ NumElements2Actions[OpcodeIdx].end())
+ NumElements2Actions[OpcodeIdx][ElementSize] = {{}};
+ SmallVector<SizeAndActionsVec, 1> &Actions =
+ NumElements2Actions[OpcodeIdx].find(ElementSize)->second;
+ setActions(TypeIndex, Actions, SizeAndActions);
+ }
+
+ /// A partial SizeAndActionsVec potentially doesn't cover all bit sizes,
+ /// i.e. it's OK if it doesn't start from size 1.
+ static void checkPartialSizeAndActionsVector(const SizeAndActionsVec& v) {
+#ifndef NDEBUG
+ // The sizes should be in increasing order
+ int prev_size = -1;
+ for(auto SizeAndAction: v) {
+ assert(SizeAndAction.first > prev_size);
+ prev_size = SizeAndAction.first;
+ }
+ // - for every Widen action, there should be a larger bitsize that
+ // can be legalized towards (e.g. Legal, Lower, Libcall or Custom
+ // action).
+ // - for every Narrow action, there should be a smaller bitsize that
+ // can be legalized towards.
+ int SmallestNarrowIdx = -1;
+ int LargestWidenIdx = -1;
+ int SmallestLegalizableToSameSizeIdx = -1;
+ int LargestLegalizableToSameSizeIdx = -1;
+ for(size_t i=0; i<v.size(); ++i) {
+ switch (v[i].second) {
+ case FewerElements:
+ case NarrowScalar:
+ if (SmallestNarrowIdx == -1)
+ SmallestNarrowIdx = i;
+ break;
+ case WidenScalar:
+ case MoreElements:
+ LargestWidenIdx = i;
+ break;
+ case Unsupported:
+ break;
+ default:
+ if (SmallestLegalizableToSameSizeIdx == -1)
+ SmallestLegalizableToSameSizeIdx = i;
+ LargestLegalizableToSameSizeIdx = i;
+ }
+ }
+ if (SmallestNarrowIdx != -1) {
+ assert(SmallestLegalizableToSameSizeIdx != -1);
+ assert(SmallestNarrowIdx > SmallestLegalizableToSameSizeIdx);
+ }
+ if (LargestWidenIdx != -1)
+ assert(LargestWidenIdx < LargestLegalizableToSameSizeIdx);
+#endif
+ }
- using TypeMap = DenseMap<LLT, LegalizeAction>;
- using SIVActionMap = DenseMap<std::pair<unsigned, LLT>, LegalizeAction>;
+ /// A full SizeAndActionsVec must cover all bit sizes, i.e. it must start
+ /// from size 1.
+ static void checkFullSizeAndActionsVector(const SizeAndActionsVec& v) {
+#ifndef NDEBUG
+ // Data structure invariant: The first bit size must be size 1.
+ assert(v.size() >= 1);
+ assert(v[0].first == 1);
+ checkPartialSizeAndActionsVector(v);
+#endif
+ }
+
+ /// Sets actions for all bit sizes on a particular generic opcode, type
+ /// index and scalar or pointer type.
+ void setActions(unsigned TypeIndex,
+ SmallVector<SizeAndActionsVec, 1> &Actions,
+ const SizeAndActionsVec &SizeAndActions) {
+ checkFullSizeAndActionsVector(SizeAndActions);
+ if (Actions.size() <= TypeIndex)
+ Actions.resize(TypeIndex + 1);
+ Actions[TypeIndex] = SizeAndActions;
+ }
- SmallVector<TypeMap, 1> Actions[LastOp - FirstOp + 1];
- SIVActionMap ScalarInVectorActions;
- DenseMap<std::pair<unsigned, LLT>, uint16_t> MaxLegalVectorElts;
- DenseMap<unsigned, LegalizeAction> DefaultActions;
+ static SizeAndAction findAction(const SizeAndActionsVec &Vec,
+ const uint32_t Size);
+
+ /// Returns the next action needed to get the scalar or pointer type closer
+ /// to being legal.
+ /// E.g. findLegalAction({G_REM, 13}) should return
+ /// (WidenScalar, 32). After that, findLegalAction({G_REM, 32}) will
+ /// probably be called, which should return (Lower, 32).
+ /// This is assuming the setScalarAction on G_REM was something like:
+ /// setScalarAction(G_REM, 0,
+ /// {{1, WidenScalar}, // bit sizes [ 1, 32[
+ /// {32, Lower}, // bit sizes [32, 33[
+ /// {33, NarrowScalar} // bit sizes [33, +inf[
+ /// });
+ std::pair<LegalizeAction, LLT>
+ findScalarLegalAction(const InstrAspect &Aspect) const;
+
+ /// Returns the next action needed towards legalizing the vector type.
+ std::pair<LegalizeAction, LLT>
+ findVectorLegalAction(const InstrAspect &Aspect) const;
+
+ static const int FirstOp = TargetOpcode::PRE_ISEL_GENERIC_OPCODE_START;
+ static const int LastOp = TargetOpcode::PRE_ISEL_GENERIC_OPCODE_END;
- bool TablesInitialized = false;
+ // Data structures used temporarily during construction of legality data:
+ typedef DenseMap<LLT, LegalizeAction> TypeMap;
+ SmallVector<TypeMap, 1> SpecifiedActions[LastOp - FirstOp + 1];
+ SmallVector<SizeChangeStrategy, 1>
+ ScalarSizeChangeStrategies[LastOp - FirstOp + 1];
+ SmallVector<SizeChangeStrategy, 1>
+ VectorElementSizeChangeStrategies[LastOp - FirstOp + 1];
+ bool TablesInitialized;
+
+ // Data structures used by getAction:
+ SmallVector<SizeAndActionsVec, 1> ScalarActions[LastOp - FirstOp + 1];
+ SmallVector<SizeAndActionsVec, 1> ScalarInVectorActions[LastOp - FirstOp + 1];
+ std::unordered_map<uint16_t, SmallVector<SizeAndActionsVec, 1>>
+ AddrSpace2PointerActions[LastOp - FirstOp + 1];
+ std::unordered_map<uint16_t, SmallVector<SizeAndActionsVec, 1>>
+ NumElements2Actions[LastOp - FirstOp + 1];
};
-} // end namespace llvm
+} // end namespace llvm.
#endif // LLVM_CODEGEN_GLOBALISEL_LEGALIZERINFO_H
diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h
index 51a0d96deda..0f5b04d9045 100644
--- a/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/include/llvm/CodeGen/MachineBasicBlock.h
@@ -97,6 +97,8 @@ private:
using const_probability_iterator =
std::vector<BranchProbability>::const_iterator;
+ Optional<uint64_t> IrrLoopHeaderWeight;
+
/// Keep track of the physical registers that are livein of the basicblock.
using LiveInVector = std::vector<RegisterMaskPair>;
LiveInVector LiveIns;
@@ -729,6 +731,14 @@ public:
/// Return the MCSymbol for this basic block.
MCSymbol *getSymbol() const;
+ Optional<uint64_t> getIrrLoopHeaderWeight() const {
+ return IrrLoopHeaderWeight;
+ }
+
+ void setIrrLoopHeaderWeight(uint64_t Weight) {
+ IrrLoopHeaderWeight = Weight;
+ }
+
private:
/// Return probability iterator corresponding to the I successor iterator.
probability_iterator getProbabilityIterator(succ_iterator I);
diff --git a/include/llvm/CodeGen/MachineBlockFrequencyInfo.h b/include/llvm/CodeGen/MachineBlockFrequencyInfo.h
index cba79c818a7..5b4b99ca0a5 100644
--- a/include/llvm/CodeGen/MachineBlockFrequencyInfo.h
+++ b/include/llvm/CodeGen/MachineBlockFrequencyInfo.h
@@ -62,6 +62,8 @@ public:
Optional<uint64_t> getBlockProfileCount(const MachineBasicBlock *MBB) const;
Optional<uint64_t> getProfileCountFromFreq(uint64_t Freq) const;
+ bool isIrrLoopHeader(const MachineBasicBlock *MBB);
+
const MachineFunction *getFunction() const;
const MachineBranchProbabilityInfo *getMBPI() const;
void view(const Twine &Name, bool isSimple = true) const;
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index 7523825285a..88a697055e8 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@@ -301,6 +301,21 @@ public:
return Operands[i];
}
+ /// Return true if operand \p OpIdx is a subregister index.
+ bool isOperandSubregIdx(unsigned OpIdx) const {
+ assert(getOperand(OpIdx).getType() == MachineOperand::MO_Immediate &&
+ "Expected MO_Immediate operand type.");
+ if (isExtractSubreg() && OpIdx == 2)
+ return true;
+ if (isInsertSubreg() && OpIdx == 3)
+ return true;
+ if (isRegSequence() && OpIdx > 1 && (OpIdx % 2) == 0)
+ return true;
+ if (isSubregToReg() && OpIdx == 3)
+ return true;
+ return false;
+ }
+
/// Returns the number of non-implicit operands.
unsigned getNumExplicitOperands() const;
diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
index 8e6b1570e4a..bf35b7d653b 100644
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@@ -417,6 +417,12 @@ namespace llvm {
/// shuffles.
FunctionPass *createExpandReductionsPass();
+ // This pass expands memcmp() into loads/stores.
+ FunctionPass *createExpandMemCmpPass();
+
+ /// Creates CFI Instruction Inserter pass. \see CFIInstrInserter.cpp
+ FunctionPass *createCFIInstrInserter();
+
} // End llvm namespace
#endif
diff --git a/include/llvm/CodeGen/ResourcePriorityQueue.h b/include/llvm/CodeGen/ResourcePriorityQueue.h
index 1a4f994259d..cc64e9d572e 100644
--- a/include/llvm/CodeGen/ResourcePriorityQueue.h
+++ b/include/llvm/CodeGen/ResourcePriorityQueue.h
@@ -20,8 +20,8 @@
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCInstrItineraries.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
namespace llvm {
diff --git a/include/llvm/CodeGen/StackMaps.h b/include/llvm/CodeGen/StackMaps.h
index 8263946ed92..4407114d274 100644
--- a/include/llvm/CodeGen/StackMaps.h
+++ b/include/llvm/CodeGen/StackMaps.h
@@ -14,6 +14,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <cassert>
@@ -25,7 +26,6 @@ namespace llvm {
class AsmPrinter;
class MCExpr;
class MCStreamer;
-class MCSymbol;
class raw_ostream;
class TargetRegisterInfo;
diff --git a/include/llvm/CodeGen/TailDuplicator.h b/include/llvm/CodeGen/TailDuplicator.h
index e5f110293c3..ea202b2e409 100644
--- a/include/llvm/CodeGen/TailDuplicator.h
+++ b/include/llvm/CodeGen/TailDuplicator.h
@@ -17,12 +17,12 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <utility>
#include <vector>
diff --git a/include/llvm/Target/TargetFrameLowering.h b/include/llvm/CodeGen/TargetFrameLowering.h
index 31017cbc27b..a94dbd7c5c0 100644
--- a/include/llvm/Target/TargetFrameLowering.h
+++ b/include/llvm/CodeGen/TargetFrameLowering.h
@@ -1,4 +1,4 @@
-//===-- llvm/Target/TargetFrameLowering.h ---------------------------*- C++ -*-===//
+//===-- llvm/CodeGen/TargetFrameLowering.h ---------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_TARGETFRAMELOWERING_H
-#define LLVM_TARGET_TARGETFRAMELOWERING_H
+#ifndef LLVM_CODEGEN_TARGETFRAMELOWERING_H
+#define LLVM_CODEGEN_TARGETFRAMELOWERING_H
#include "llvm/CodeGen/MachineBasicBlock.h"
#include <utility>
@@ -341,6 +341,14 @@ public:
return false;
return true;
}
+
+ /// Return the initial CFA offset value, i.e. the one valid at the beginning
+ /// of the function (before any stack operations).
+ virtual int getInitialCFAOffset(const MachineFunction &MF) const;
+
+ /// Return the initial CFA register value, i.e. the one valid at the beginning
+ /// of the function (before any stack operations).
+ virtual unsigned getInitialCFARegister(const MachineFunction &MF) const;
};
} // End llvm namespace
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/CodeGen/TargetInstrInfo.h
index 5d230d820db..6770e503e61 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1,4 +1,4 @@
-//===- llvm/Target/TargetInstrInfo.h - Instruction Info ---------*- C++ -*-===//
+//===- llvm/CodeGen/TargetInstrInfo.h - Instruction Info --------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h b/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
index a4d8c0dd716..f89bcf82fee 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
@@ -22,6 +22,7 @@
namespace llvm {
+class DWARFUnit;
class raw_ostream;
class DWARFDebugLine {
@@ -95,7 +96,8 @@ public:
void clear();
void dump(raw_ostream &OS) const;
- bool parse(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr);
+ bool parse(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr,
+ const DWARFUnit *U = nullptr);
};
/// Standard .debug_line state machine structure.
@@ -218,7 +220,7 @@ public:
/// Parse prologue and all rows.
bool parse(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr,
- raw_ostream *OS = nullptr);
+ const DWARFUnit *U, raw_ostream *OS = nullptr);
using RowVector = std::vector<Row>;
using RowIter = RowVector::const_iterator;
@@ -236,7 +238,7 @@ public:
const LineTable *getLineTable(uint32_t Offset) const;
const LineTable *getOrParseLineTable(const DWARFDataExtractor &DebugLineData,
- uint32_t Offset);
+ uint32_t Offset, const DWARFUnit *U);
private:
struct ParsingState {
diff --git a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
index 497fe591c96..d32053519ec 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
@@ -104,16 +104,12 @@ public:
const DWARFUnit *getUnit() const { return U; }
void dump(raw_ostream &OS, DIDumpOptions DumpOpts = DIDumpOptions()) const;
- /// Extracts a value in \p Data at offset \p *OffsetPtr.
- ///
- /// The passed DWARFUnit is allowed to be nullptr, in which case some
- /// kind of forms that depend on Unit information are disallowed.
- /// \param Data The DWARFDataExtractor to use.
- /// \param OffsetPtr The offset within \p Data where the data starts.
- /// \param U The optional DWARFUnit supplying information for some forms.
- /// \returns whether the extraction succeeded.
+ /// Extracts a value in \p Data at offset \p *OffsetPtr. The information
+ /// in \p FormParams is needed to interpret some forms. The optional
+ /// \p Unit allows extracting information if the form refers to other
+ /// sections (e.g., .debug_str).
bool extractValue(const DWARFDataExtractor &Data, uint32_t *OffsetPtr,
- const DWARFUnit *U);
+ DWARFFormParams FormParams, const DWARFUnit *U = nullptr);
bool isInlinedCStr() const {
return Value.data != nullptr && Value.data == (const uint8_t *)Value.cstr;
diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h
index 6714f2c9747..77cfc9776df 100644
--- a/include/llvm/IR/BasicBlock.h
+++ b/include/llvm/IR/BasicBlock.h
@@ -398,6 +398,8 @@ public:
/// \brief Return true if it is legal to hoist instructions into this block.
bool isLegalToHoistInto() const;
+ Optional<uint64_t> getIrrLoopHeaderWeight() const;
+
private:
/// \brief Increment the internal refcount of the number of BlockAddresses
/// referencing this BasicBlock by \p Amt.
diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h
index bee8cf8a39d..c515f6de2d8 100644
--- a/include/llvm/IR/DebugInfoMetadata.h
+++ b/include/llvm/IR/DebugInfoMetadata.h
@@ -1419,19 +1419,15 @@ public:
/// represented in a single line entry. In this case, no location
/// should be set, unless the merged instruction is a call, which we will
/// set the merged debug location as line 0 of the nearest common scope
- /// where 2 locations are inlined from. This only applies to Instruction,
- /// For MachineInstruction, as it is post-inline, we will treat the call
+ /// where 2 locations are inlined from. This only applies to Instruction;
+ /// for MachineInstruction, as it is post-inline, we will treat the call
/// instruction the same way as other instructions.
///
- /// This should only be used by MachineInstruction because call can be
- /// treated the same as other instructions. Otherwise, use
- /// \p applyMergedLocation instead.
- static const DILocation *getMergedLocation(const DILocation *LocA,
- const DILocation *LocB) {
- if (LocA && LocB && (LocA == LocB || !LocA->canDiscriminate(*LocB)))
- return LocA;
- return nullptr;
- }
+ /// \p ForInst: The Instruction the merged DILocation is for. If the
+ /// Instruction is unavailable or non-existent, use nullptr.
+ static const DILocation *
+ getMergedLocation(const DILocation *LocA, const DILocation *LocB,
+ const Instruction *ForInst = nullptr);
/// Returns the base discriminator for a given encoded discriminator \p D.
static unsigned getBaseDiscriminatorFromDiscriminator(unsigned D) {
@@ -2310,9 +2306,11 @@ public:
///
/// \param OffsetInBits Offset of the piece in bits.
/// \param SizeInBits Size of the piece in bits.
- static DIExpression *createFragmentExpression(const DIExpression *Exp,
- unsigned OffsetInBits,
- unsigned SizeInBits);
+ /// \return The new fragment expression, or None if \c Expr contains
+ /// arithmetic operations that would be truncated.
+ static Optional<DIExpression *>
+ createFragmentExpression(const DIExpression *Expr, unsigned OffsetInBits,
+ unsigned SizeInBits);
};
/// Global variables.
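A hedged sketch of a call site adjusting to the new Optional return (Expr, OffsetInBits and SizeInBits are assumed to be in scope):

  Optional<DIExpression *> Frag =
      DIExpression::createFragmentExpression(Expr, OffsetInBits, SizeInBits);
  if (!Frag)
    return; // the fragment would truncate an arithmetic operand; give up
  DIExpression *FragExpr = *Frag; // use the fragment expression from here on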
diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h
index 66b1e7e01fe..41f379b87c2 100644
--- a/include/llvm/IR/Instruction.h
+++ b/include/llvm/IR/Instruction.h
@@ -308,10 +308,15 @@ public:
/// Determine whether the exact flag is set.
bool isExact() const;
- /// Set or clear the unsafe-algebra flag on this instruction, which must be an
+ /// Set or clear all fast-math-flags on this instruction, which must be an
/// operator which supports this flag. See LangRef.html for the meaning of
/// this flag.
- void setHasUnsafeAlgebra(bool B);
+ void setFast(bool B);
+
+ /// Set or clear the reassociation flag on this instruction, which must be
+ /// an operator which supports this flag. See LangRef.html for the meaning of
+ /// this flag.
+ void setHasAllowReassoc(bool B);
/// Set or clear the no-nans flag on this instruction, which must be an
/// operator which supports this flag. See LangRef.html for the meaning of
@@ -333,6 +338,11 @@ public:
/// this flag.
void setHasAllowReciprocal(bool B);
+ /// Set or clear the approximate-math-functions flag on this instruction,
+ /// which must be an operator which supports this flag. See LangRef.html for
+ /// the meaning of this flag.
+ void setHasApproxFunc(bool B);
+
/// Convenience function for setting multiple fast-math flags on this
/// instruction, which must be an operator which supports these flags. See
/// LangRef.html for the meaning of these flags.
@@ -343,8 +353,11 @@ public:
/// LangRef.html for the meaning of these flags.
void copyFastMathFlags(FastMathFlags FMF);
- /// Determine whether the unsafe-algebra flag is set.
- bool hasUnsafeAlgebra() const;
+ /// Determine whether all fast-math-flags are set.
+ bool isFast() const;
+
+ /// Determine whether the allow-reassociation flag is set.
+ bool hasAllowReassoc() const;
/// Determine whether the no-NaNs flag is set.
bool hasNoNaNs() const;
@@ -361,6 +374,9 @@ public:
/// Determine whether the allow-contract flag is set.
bool hasAllowContract() const;
+ /// Determine whether the approximate-math-functions flag is set.
+ bool hasApproxFunc() const;
+
/// Convenience function for getting all the fast-math flags, which must be an
/// operator which supports these flags. See LangRef.html for the meaning of
/// these flags.
diff --git a/include/llvm/IR/IntrinsicsNVVM.td b/include/llvm/IR/IntrinsicsNVVM.td
index 7ba1a3eb2e5..249419d15d3 100644
--- a/include/llvm/IR/IntrinsicsNVVM.td
+++ b/include/llvm/IR/IntrinsicsNVVM.td
@@ -683,10 +683,15 @@ let TargetPrefix = "nvvm" in {
Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
-// Atomic not available as an llvm intrinsic.
+// Atomics not available as llvm intrinsics.
def int_nvvm_atomic_load_add_f32 : Intrinsic<[llvm_float_ty],
[LLVMAnyPointerType<llvm_float_ty>, llvm_float_ty],
[IntrArgMemOnly, NoCapture<0>]>;
+ // Atomic add of f64 requires sm_60.
+ def int_nvvm_atomic_load_add_f64 : Intrinsic<[llvm_double_ty],
+ [LLVMAnyPointerType<llvm_double_ty>, llvm_double_ty],
+ [IntrArgMemOnly, NoCapture<0>]>;
+
def int_nvvm_atomic_load_inc_32 : Intrinsic<[llvm_i32_ty],
[LLVMAnyPointerType<llvm_i32_ty>, llvm_i32_ty],
[IntrArgMemOnly, NoCapture<0>]>;
diff --git a/include/llvm/IR/LLVMContext.h b/include/llvm/IR/LLVMContext.h
index 9e935823c77..a95634d32c2 100644
--- a/include/llvm/IR/LLVMContext.h
+++ b/include/llvm/IR/LLVMContext.h
@@ -101,6 +101,7 @@ public:
MD_absolute_symbol = 21, // "absolute_symbol"
MD_associated = 22, // "associated"
MD_callees = 23, // "callees"
+ MD_irr_loop = 24, // "irr_loop"
};
/// Known operand bundle tag IDs, which always have the same value. All
diff --git a/include/llvm/IR/MDBuilder.h b/include/llvm/IR/MDBuilder.h
index d679cef95b6..15c1b9cb60e 100644
--- a/include/llvm/IR/MDBuilder.h
+++ b/include/llvm/IR/MDBuilder.h
@@ -173,6 +173,9 @@ public:
/// base type, access type and offset relative to the base type.
MDNode *createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType,
uint64_t Offset, bool IsConstant = false);
+
+ /// \brief Return metadata containing an irreducible loop header weight.
+ MDNode *createIrrLoopHeaderWeight(uint64_t Weight);
};
} // end namespace llvm
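For illustration only (the producer pass is not part of this diff; Header is an assumed BasicBlock and Weight an assumed uint64_t), an irreducible-loop-header weight could be attached via the new builder method and metadata kind:

  MDBuilder MDB(Header.getContext());
  Header.getTerminator()->setMetadata(LLVMContext::MD_irr_loop,
                                      MDB.createIrrLoopHeaderWeight(Weight));
  // BasicBlock::getIrrLoopHeaderWeight(), declared earlier in this diff, is
  // then expected to surface the weight to BlockFrequencyInfoImpl.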
diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h
index 2d664f41e3c..b1e58a2a0d9 100644
--- a/include/llvm/IR/ModuleSummaryIndex.h
+++ b/include/llvm/IR/ModuleSummaryIndex.h
@@ -148,11 +148,15 @@ public:
/// In combined summary, indicate that the global value is live.
unsigned Live : 1;
+ /// Indicates that the linker resolved the symbol to a definition from
+ /// within the same linkage unit.
+ unsigned DSOLocal : 1;
+
/// Convenience Constructors
explicit GVFlags(GlobalValue::LinkageTypes Linkage,
- bool NotEligibleToImport, bool Live)
+ bool NotEligibleToImport, bool Live, bool IsLocal)
: Linkage(Linkage), NotEligibleToImport(NotEligibleToImport),
- Live(Live) {}
+ Live(Live), DSOLocal(IsLocal) {}
};
private:
@@ -229,6 +233,10 @@ public:
void setLive(bool Live) { Flags.Live = Live; }
+ void setDSOLocal(bool Local) { Flags.DSOLocal = Local; }
+
+ bool isDSOLocal() const { return Flags.DSOLocal; }
+
/// Flag that this global value cannot be imported.
void setNotEligibleToImport() { Flags.NotEligibleToImport = true; }
diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h
index 2f9990ca03d..4687f2d53e7 100644
--- a/include/llvm/IR/ModuleSummaryIndexYAML.h
+++ b/include/llvm/IR/ModuleSummaryIndexYAML.h
@@ -135,7 +135,7 @@ template <> struct MappingTraits<TypeIdSummary> {
struct FunctionSummaryYaml {
unsigned Linkage;
- bool NotEligibleToImport, Live;
+ bool NotEligibleToImport, Live, IsLocal;
std::vector<uint64_t> TypeTests;
std::vector<FunctionSummary::VFuncId> TypeTestAssumeVCalls,
TypeCheckedLoadVCalls;
@@ -177,6 +177,7 @@ template <> struct MappingTraits<FunctionSummaryYaml> {
io.mapOptional("Linkage", summary.Linkage);
io.mapOptional("NotEligibleToImport", summary.NotEligibleToImport);
io.mapOptional("Live", summary.Live);
+ io.mapOptional("Local", summary.IsLocal);
io.mapOptional("TypeTests", summary.TypeTests);
io.mapOptional("TypeTestAssumeVCalls", summary.TypeTestAssumeVCalls);
io.mapOptional("TypeCheckedLoadVCalls", summary.TypeCheckedLoadVCalls);
@@ -211,7 +212,7 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
Elem.SummaryList.push_back(llvm::make_unique<FunctionSummary>(
GlobalValueSummary::GVFlags(
static_cast<GlobalValue::LinkageTypes>(FSum.Linkage),
- FSum.NotEligibleToImport, FSum.Live),
+ FSum.NotEligibleToImport, FSum.Live, FSum.IsLocal),
0, FunctionSummary::FFlags{}, ArrayRef<ValueInfo>{},
ArrayRef<FunctionSummary::EdgeTy>{}, std::move(FSum.TypeTests),
std::move(FSum.TypeTestAssumeVCalls),
@@ -228,7 +229,8 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
FSums.push_back(FunctionSummaryYaml{
FSum->flags().Linkage,
static_cast<bool>(FSum->flags().NotEligibleToImport),
- static_cast<bool>(FSum->flags().Live), FSum->type_tests(),
+ static_cast<bool>(FSum->flags().Live),
+ static_cast<bool>(FSum->flags().DSOLocal), FSum->type_tests(),
FSum->type_test_assume_vcalls(), FSum->type_checked_load_vcalls(),
FSum->type_test_assume_const_vcalls(),
FSum->type_checked_load_const_vcalls()});
diff --git a/include/llvm/IR/Operator.h b/include/llvm/IR/Operator.h
index ae9255174a3..01746e4b6a2 100644
--- a/include/llvm/IR/Operator.h
+++ b/include/llvm/IR/Operator.h
@@ -163,52 +163,61 @@ private:
unsigned Flags = 0;
- FastMathFlags(unsigned F) : Flags(F) { }
+ FastMathFlags(unsigned F) {
+ // If all 7 bits are set, turn this into -1. If the number of bits grows,
+ // this must be updated. This is intended to provide some forward binary
+ // compatibility insurance for the meaning of 'fast' in case bits are added.
+ if (F == 0x7F) Flags = ~0U;
+ else Flags = F;
+ }
public:
- /// This is how the bits are used in Value::SubclassOptionalData so they
- /// should fit there too.
+ // This is how the bits are used in Value::SubclassOptionalData so they
+ // should fit there too.
+ // WARNING: We're out of space. SubclassOptionalData only has 7 bits. New
+ // functionality will require a change in how this information is stored.
enum {
- UnsafeAlgebra = (1 << 0),
+ AllowReassoc = (1 << 0),
NoNaNs = (1 << 1),
NoInfs = (1 << 2),
NoSignedZeros = (1 << 3),
AllowReciprocal = (1 << 4),
- AllowContract = (1 << 5)
+ AllowContract = (1 << 5),
+ ApproxFunc = (1 << 6)
};
FastMathFlags() = default;
- /// Whether any flag is set
bool any() const { return Flags != 0; }
+ bool none() const { return Flags == 0; }
+ bool all() const { return Flags == ~0U; }
- /// Set all the flags to false
void clear() { Flags = 0; }
+ void set() { Flags = ~0U; }
/// Flag queries
+ bool allowReassoc() const { return 0 != (Flags & AllowReassoc); }
bool noNaNs() const { return 0 != (Flags & NoNaNs); }
bool noInfs() const { return 0 != (Flags & NoInfs); }
bool noSignedZeros() const { return 0 != (Flags & NoSignedZeros); }
bool allowReciprocal() const { return 0 != (Flags & AllowReciprocal); }
- bool allowContract() const { return 0 != (Flags & AllowContract); }
- bool unsafeAlgebra() const { return 0 != (Flags & UnsafeAlgebra); }
+ bool allowContract() const { return 0 != (Flags & AllowContract); }
+ bool approxFunc() const { return 0 != (Flags & ApproxFunc); }
+ /// 'Fast' means all bits are set.
+ bool isFast() const { return all(); }
/// Flag setters
+ void setAllowReassoc() { Flags |= AllowReassoc; }
void setNoNaNs() { Flags |= NoNaNs; }
void setNoInfs() { Flags |= NoInfs; }
void setNoSignedZeros() { Flags |= NoSignedZeros; }
void setAllowReciprocal() { Flags |= AllowReciprocal; }
+ // TODO: Change the other set* functions to take a parameter?
void setAllowContract(bool B) {
Flags = (Flags & ~AllowContract) | B * AllowContract;
}
- void setUnsafeAlgebra() {
- Flags |= UnsafeAlgebra;
- setNoNaNs();
- setNoInfs();
- setNoSignedZeros();
- setAllowReciprocal();
- setAllowContract(true);
- }
+ void setApproxFunc() { Flags |= ApproxFunc; }
+ void setFast() { set(); }
void operator&=(const FastMathFlags &OtherFlags) {
Flags &= OtherFlags.Flags;
@@ -221,18 +230,21 @@ class FPMathOperator : public Operator {
private:
friend class Instruction;
- void setHasUnsafeAlgebra(bool B) {
+ /// 'Fast' means all bits are set.
+ void setFast(bool B) {
+ setHasAllowReassoc(B);
+ setHasNoNaNs(B);
+ setHasNoInfs(B);
+ setHasNoSignedZeros(B);
+ setHasAllowReciprocal(B);
+ setHasAllowContract(B);
+ setHasApproxFunc(B);
+ }
+
+ void setHasAllowReassoc(bool B) {
SubclassOptionalData =
- (SubclassOptionalData & ~FastMathFlags::UnsafeAlgebra) |
- (B * FastMathFlags::UnsafeAlgebra);
-
- // Unsafe algebra implies all the others
- if (B) {
- setHasNoNaNs(true);
- setHasNoInfs(true);
- setHasNoSignedZeros(true);
- setHasAllowReciprocal(true);
- }
+ (SubclassOptionalData & ~FastMathFlags::AllowReassoc) |
+ (B * FastMathFlags::AllowReassoc);
}
void setHasNoNaNs(bool B) {
@@ -265,6 +277,12 @@ private:
(B * FastMathFlags::AllowContract);
}
+ void setHasApproxFunc(bool B) {
+ SubclassOptionalData =
+ (SubclassOptionalData & ~FastMathFlags::ApproxFunc) |
+ (B * FastMathFlags::ApproxFunc);
+ }
+
/// Convenience function for setting multiple fast-math flags.
/// FMF is a mask of the bits to set.
void setFastMathFlags(FastMathFlags FMF) {
@@ -278,42 +296,53 @@ private:
}
public:
- /// Test whether this operation is permitted to be
- /// algebraically transformed, aka the 'A' fast-math property.
- bool hasUnsafeAlgebra() const {
- return (SubclassOptionalData & FastMathFlags::UnsafeAlgebra) != 0;
+ /// Test if this operation allows all non-strict floating-point transforms.
+ bool isFast() const {
+ return ((SubclassOptionalData & FastMathFlags::AllowReassoc) != 0 &&
+ (SubclassOptionalData & FastMathFlags::NoNaNs) != 0 &&
+ (SubclassOptionalData & FastMathFlags::NoInfs) != 0 &&
+ (SubclassOptionalData & FastMathFlags::NoSignedZeros) != 0 &&
+ (SubclassOptionalData & FastMathFlags::AllowReciprocal) != 0 &&
+ (SubclassOptionalData & FastMathFlags::AllowContract) != 0 &&
+ (SubclassOptionalData & FastMathFlags::ApproxFunc) != 0);
+ }
+
+ /// Test if this operation may be simplified with reassociative transforms.
+ bool hasAllowReassoc() const {
+ return (SubclassOptionalData & FastMathFlags::AllowReassoc) != 0;
}
- /// Test whether this operation's arguments and results are to be
- /// treated as non-NaN, aka the 'N' fast-math property.
+ /// Test if this operation's arguments and results are assumed not-NaN.
bool hasNoNaNs() const {
return (SubclassOptionalData & FastMathFlags::NoNaNs) != 0;
}
- /// Test whether this operation's arguments and results are to be
- /// treated as NoN-Inf, aka the 'I' fast-math property.
+ /// Test if this operation's arguments and results are assumed not-infinite.
bool hasNoInfs() const {
return (SubclassOptionalData & FastMathFlags::NoInfs) != 0;
}
- /// Test whether this operation can treat the sign of zero
- /// as insignificant, aka the 'S' fast-math property.
+ /// Test if this operation can ignore the sign of zero.
bool hasNoSignedZeros() const {
return (SubclassOptionalData & FastMathFlags::NoSignedZeros) != 0;
}
- /// Test whether this operation is permitted to use
- /// reciprocal instead of division, aka the 'R' fast-math property.
+ /// Test if this operation can use reciprocal multiply instead of division.
bool hasAllowReciprocal() const {
return (SubclassOptionalData & FastMathFlags::AllowReciprocal) != 0;
}
- /// Test whether this operation is permitted to
- /// be floating-point contracted.
+ /// Test if this operation can be floating-point contracted (FMA).
bool hasAllowContract() const {
return (SubclassOptionalData & FastMathFlags::AllowContract) != 0;
}
+ /// Test if this operation allows approximations of math library functions or
+ /// intrinsics.
+ bool hasApproxFunc() const {
+ return (SubclassOptionalData & FastMathFlags::ApproxFunc) != 0;
+ }
+
/// Convenience function for getting all the fast-math flags
FastMathFlags getFastMathFlags() const {
return FastMathFlags(SubclassOptionalData);
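A short standalone sketch (not from this patch) of the reworked semantics: 'fast' now means every individual bit, including the new reassoc and afn bits, is set.

  FastMathFlags FMF;
  FMF.setAllowReassoc();
  FMF.setNoNaNs();
  FMF.setApproxFunc();
  assert(!FMF.isFast());   // only three of the seven bits are set
  FMF.setFast();           // same as set(): turn all bits on
  assert(FMF.isFast() && FMF.allowContract());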
diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h
index 9e4914973ed..f50f0172685 100644
--- a/include/llvm/IR/Value.h
+++ b/include/llvm/IR/Value.h
@@ -299,6 +299,12 @@ public:
/// values or constant users.
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB);
+ /// replaceUsesExceptBlockAddr - Go through the uses list for this definition
+ /// and make each use point to "New" instead of "this". 'This's use list is
+ /// expected to have at least one element. Unlike replaceAllUsesWith, this
+ /// function skips blockaddr uses.
+ void replaceUsesExceptBlockAddr(Value *New);
+
//----------------------------------------------------------------------
// Methods for handling the chain of uses of this Value.
//
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index c3ad8fe41af..7616534d8d5 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -80,11 +80,13 @@ void initializeBranchFolderPassPass(PassRegistry&);
void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry&);
void initializeBranchRelaxationPass(PassRegistry&);
void initializeBreakCriticalEdgesPass(PassRegistry&);
+void initializeCallSiteSplittingLegacyPassPass(PassRegistry&);
void initializeCFGOnlyPrinterLegacyPassPass(PassRegistry&);
void initializeCFGOnlyViewerLegacyPassPass(PassRegistry&);
void initializeCFGPrinterLegacyPassPass(PassRegistry&);
void initializeCFGSimplifyPassPass(PassRegistry&);
void initializeCFGViewerLegacyPassPass(PassRegistry&);
+void initializeCFIInstrInserterPass(PassRegistry&);
void initializeCFLAndersAAWrapperPassPass(PassRegistry&);
void initializeCFLSteensAAWrapperPassPass(PassRegistry&);
void initializeCallGraphDOTPrinterPass(PassRegistry&);
@@ -128,6 +130,7 @@ void initializeEdgeBundlesPass(PassRegistry&);
void initializeEfficiencySanitizerPass(PassRegistry&);
void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&);
void initializeExpandISelPseudosPass(PassRegistry&);
+void initializeExpandMemCmpPassPass(PassRegistry&);
void initializeExpandPostRAPass(PassRegistry&);
void initializeExpandReductionsPass(PassRegistry&);
void initializeExternalAAWrapperPassPass(PassRegistry&);
@@ -377,6 +380,7 @@ void initializeWinEHPreparePass(PassRegistry&);
void initializeWriteBitcodePassPass(PassRegistry&);
void initializeWriteThinLTOBitcodePass(PassRegistry&);
void initializeXRayInstrumentationPass(PassRegistry&);
+void initializeMIRCanonicalizerPass(PassRegistry &);
} // end namespace llvm
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index 765e63926da..ce70f53ccb0 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -180,6 +180,7 @@ namespace {
(void) llvm::createReversePostOrderFunctionAttrsPass();
(void) llvm::createMergeFunctionsPass();
(void) llvm::createMergeICmpsPass();
+ (void) llvm::createExpandMemCmpPass();
std::string buf;
llvm::raw_string_ostream os(buf);
(void) llvm::createPrintModulePass(os);
diff --git a/include/llvm/MC/MCFragment.h b/include/llvm/MC/MCFragment.h
index 7c66b2126cd..3d73a027903 100644
--- a/include/llvm/MC/MCFragment.h
+++ b/include/llvm/MC/MCFragment.h
@@ -42,7 +42,7 @@ public:
FT_DwarfFrame,
FT_LEB,
FT_Padding,
- FT_SafeSEH,
+ FT_SymbolId,
FT_CVInlineLines,
FT_CVDefRange,
FT_Dummy
@@ -562,12 +562,13 @@ public:
}
};
-class MCSafeSEHFragment : public MCFragment {
+/// Represents a symbol table index fragment.
+class MCSymbolIdFragment : public MCFragment {
const MCSymbol *Sym;
public:
- MCSafeSEHFragment(const MCSymbol *Sym, MCSection *Sec = nullptr)
- : MCFragment(FT_SafeSEH, false, 0, Sec), Sym(Sym) {}
+ MCSymbolIdFragment(const MCSymbol *Sym, MCSection *Sec = nullptr)
+ : MCFragment(FT_SymbolId, false, 0, Sec), Sym(Sym) {}
/// \name Accessors
/// @{
@@ -578,7 +579,7 @@ public:
/// @}
static bool classof(const MCFragment *F) {
- return F->getKind() == MCFragment::FT_SafeSEH;
+ return F->getKind() == MCFragment::FT_SymbolId;
}
};
diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h
index 92fb46e8e93..c24b6310465 100644
--- a/include/llvm/Object/ELF.h
+++ b/include/llvm/Object/ELF.h
@@ -205,6 +205,46 @@ getExtendedSymbolTableIndex(const typename ELFT::Sym *Sym,
}
template <class ELFT>
+Expected<uint32_t>
+ELFFile<ELFT>::getSectionIndex(const Elf_Sym *Sym, Elf_Sym_Range Syms,
+ ArrayRef<Elf_Word> ShndxTable) const {
+ uint32_t Index = Sym->st_shndx;
+ if (Index == ELF::SHN_XINDEX) {
+ auto ErrorOrIndex = getExtendedSymbolTableIndex<ELFT>(
+ Sym, Syms.begin(), ShndxTable);
+ if (!ErrorOrIndex)
+ return ErrorOrIndex.takeError();
+ return *ErrorOrIndex;
+ }
+ if (Index == ELF::SHN_UNDEF || Index >= ELF::SHN_LORESERVE)
+ return 0;
+ return Index;
+}
+
+template <class ELFT>
+Expected<const typename ELFT::Shdr *>
+ELFFile<ELFT>::getSection(const Elf_Sym *Sym, const Elf_Shdr *SymTab,
+ ArrayRef<Elf_Word> ShndxTable) const {
+ auto SymsOrErr = symbols(SymTab);
+ if (!SymsOrErr)
+ return SymsOrErr.takeError();
+ return getSection(Sym, *SymsOrErr, ShndxTable);
+}
+
+template <class ELFT>
+Expected<const typename ELFT::Shdr *>
+ELFFile<ELFT>::getSection(const Elf_Sym *Sym, Elf_Sym_Range Symbols,
+ ArrayRef<Elf_Word> ShndxTable) const {
+ auto IndexOrErr = getSectionIndex(Sym, Symbols, ShndxTable);
+ if (!IndexOrErr)
+ return IndexOrErr.takeError();
+ uint32_t Index = *IndexOrErr;
+ if (Index == 0)
+ return nullptr;
+ return getSection(Index);
+}
+
+template <class ELFT>
inline Expected<const typename ELFT::Sym *>
getSymbol(typename ELFT::SymRange Symbols, uint32_t Index) {
if (Index >= Symbols.size())
@@ -213,6 +253,15 @@ getSymbol(typename ELFT::SymRange Symbols, uint32_t Index) {
}
template <class ELFT>
+Expected<const typename ELFT::Sym *>
+ELFFile<ELFT>::getSymbol(const Elf_Shdr *Sec, uint32_t Index) const {
+ auto SymtabOrErr = symbols(Sec);
+ if (!SymtabOrErr)
+ return SymtabOrErr.takeError();
+ return object::getSymbol<ELFT>(*SymtabOrErr, Index);
+}
+
+template <class ELFT>
template <typename T>
Expected<ArrayRef<T>>
ELFFile<ELFT>::getSectionContentsAsArray(const Elf_Shdr *Sec) const {
@@ -233,6 +282,119 @@ ELFFile<ELFT>::getSectionContentsAsArray(const Elf_Shdr *Sec) const {
}
template <class ELFT>
+Expected<ArrayRef<uint8_t>>
+ELFFile<ELFT>::getSectionContents(const Elf_Shdr *Sec) const {
+ return getSectionContentsAsArray<uint8_t>(Sec);
+}
+
+template <class ELFT>
+StringRef ELFFile<ELFT>::getRelocationTypeName(uint32_t Type) const {
+ return getELFRelocationTypeName(getHeader()->e_machine, Type);
+}
+
+template <class ELFT>
+void ELFFile<ELFT>::getRelocationTypeName(uint32_t Type,
+ SmallVectorImpl<char> &Result) const {
+ if (!isMipsELF64()) {
+ StringRef Name = getRelocationTypeName(Type);
+ Result.append(Name.begin(), Name.end());
+ } else {
+ // The Mips N64 ABI allows up to three operations to be specified per
+ // relocation record. Unfortunately there's no easy way to test for the
+ // presence of N64 ELFs as they have no special flag that identifies them
+ // as being N64. We can safely assume at the moment that all Mips
+ // ELFCLASS64 ELFs are N64. New Mips64 ABIs should provide enough
+ // information to disambiguate between old vs new ABIs.
+ uint8_t Type1 = (Type >> 0) & 0xFF;
+ uint8_t Type2 = (Type >> 8) & 0xFF;
+ uint8_t Type3 = (Type >> 16) & 0xFF;
+
+ // Concat all three relocation type names.
+ StringRef Name = getRelocationTypeName(Type1);
+ Result.append(Name.begin(), Name.end());
+
+ Name = getRelocationTypeName(Type2);
+ Result.append(1, '/');
+ Result.append(Name.begin(), Name.end());
+
+ Name = getRelocationTypeName(Type3);
+ Result.append(1, '/');
+ Result.append(Name.begin(), Name.end());
+ }
+}
+
+template <class ELFT>
+Expected<const typename ELFT::Sym *>
+ELFFile<ELFT>::getRelocationSymbol(const Elf_Rel *Rel,
+ const Elf_Shdr *SymTab) const {
+ uint32_t Index = Rel->getSymbol(isMips64EL());
+ if (Index == 0)
+ return nullptr;
+ return getEntry<Elf_Sym>(SymTab, Index);
+}
+
+template <class ELFT>
+Expected<StringRef>
+ELFFile<ELFT>::getSectionStringTable(Elf_Shdr_Range Sections) const {
+ uint32_t Index = getHeader()->e_shstrndx;
+ if (Index == ELF::SHN_XINDEX)
+ Index = Sections[0].sh_link;
+
+ if (!Index) // no section string table.
+ return "";
+ if (Index >= Sections.size())
+ return createError("invalid section index");
+ return getStringTable(&Sections[Index]);
+}
+
+template <class ELFT> ELFFile<ELFT>::ELFFile(StringRef Object) : Buf(Object) {}
+
+template <class ELFT>
+Expected<ELFFile<ELFT>> ELFFile<ELFT>::create(StringRef Object) {
+ if (sizeof(Elf_Ehdr) > Object.size())
+ return createError("Invalid buffer");
+ return ELFFile(Object);
+}
+
+template <class ELFT>
+Expected<typename ELFT::ShdrRange> ELFFile<ELFT>::sections() const {
+ const uintX_t SectionTableOffset = getHeader()->e_shoff;
+ if (SectionTableOffset == 0)
+ return ArrayRef<Elf_Shdr>();
+
+ if (getHeader()->e_shentsize != sizeof(Elf_Shdr))
+ return createError(
+ "invalid section header entry size (e_shentsize) in ELF header");
+
+ const uint64_t FileSize = Buf.size();
+
+ if (SectionTableOffset + sizeof(Elf_Shdr) > FileSize)
+ return createError("section header table goes past the end of the file");
+
+ // Invalid address alignment of section headers
+ if (SectionTableOffset & (alignof(Elf_Shdr) - 1))
+ return createError("invalid alignment of section headers");
+
+ const Elf_Shdr *First =
+ reinterpret_cast<const Elf_Shdr *>(base() + SectionTableOffset);
+
+ uintX_t NumSections = getHeader()->e_shnum;
+ if (NumSections == 0)
+ NumSections = First->sh_size;
+
+ if (NumSections > UINT64_MAX / sizeof(Elf_Shdr))
+ return createError("section table goes past the end of file");
+
+ const uint64_t SectionTableSize = NumSections * sizeof(Elf_Shdr);
+
+ // Section table goes past end of file!
+ if (SectionTableOffset + SectionTableSize > FileSize)
+ return createError("section table goes past the end of file");
+
+ return makeArrayRef(First, NumSections);
+}
+
+template <class ELFT>
template <typename T>
Expected<const T *> ELFFile<ELFT>::getEntry(uint32_t Section,
uint32_t Entry) const {
@@ -254,6 +416,107 @@ Expected<const T *> ELFFile<ELFT>::getEntry(const Elf_Shdr *Section,
return reinterpret_cast<const T *>(base() + Pos);
}
+template <class ELFT>
+Expected<const typename ELFT::Shdr *>
+ELFFile<ELFT>::getSection(uint32_t Index) const {
+ auto TableOrErr = sections();
+ if (!TableOrErr)
+ return TableOrErr.takeError();
+ return object::getSection<ELFT>(*TableOrErr, Index);
+}
+
+template <class ELFT>
+Expected<StringRef>
+ELFFile<ELFT>::getStringTable(const Elf_Shdr *Section) const {
+ if (Section->sh_type != ELF::SHT_STRTAB)
+ return createError("invalid sh_type for string table, expected SHT_STRTAB");
+ auto V = getSectionContentsAsArray<char>(Section);
+ if (!V)
+ return V.takeError();
+ ArrayRef<char> Data = *V;
+ if (Data.empty())
+ return createError("empty string table");
+ if (Data.back() != '\0')
+ return createError("string table non-null terminated");
+ return StringRef(Data.begin(), Data.size());
+}
+
+template <class ELFT>
+Expected<ArrayRef<typename ELFT::Word>>
+ELFFile<ELFT>::getSHNDXTable(const Elf_Shdr &Section) const {
+ auto SectionsOrErr = sections();
+ if (!SectionsOrErr)
+ return SectionsOrErr.takeError();
+ return getSHNDXTable(Section, *SectionsOrErr);
+}
+
+template <class ELFT>
+Expected<ArrayRef<typename ELFT::Word>>
+ELFFile<ELFT>::getSHNDXTable(const Elf_Shdr &Section,
+ Elf_Shdr_Range Sections) const {
+ assert(Section.sh_type == ELF::SHT_SYMTAB_SHNDX);
+ auto VOrErr = getSectionContentsAsArray<Elf_Word>(&Section);
+ if (!VOrErr)
+ return VOrErr.takeError();
+ ArrayRef<Elf_Word> V = *VOrErr;
+ auto SymTableOrErr = object::getSection<ELFT>(Sections, Section.sh_link);
+ if (!SymTableOrErr)
+ return SymTableOrErr.takeError();
+ const Elf_Shdr &SymTable = **SymTableOrErr;
+ if (SymTable.sh_type != ELF::SHT_SYMTAB &&
+ SymTable.sh_type != ELF::SHT_DYNSYM)
+ return createError("invalid sh_type");
+ if (V.size() != (SymTable.sh_size / sizeof(Elf_Sym)))
+ return createError("invalid section contents size");
+ return V;
+}
+
+template <class ELFT>
+Expected<StringRef>
+ELFFile<ELFT>::getStringTableForSymtab(const Elf_Shdr &Sec) const {
+ auto SectionsOrErr = sections();
+ if (!SectionsOrErr)
+ return SectionsOrErr.takeError();
+ return getStringTableForSymtab(Sec, *SectionsOrErr);
+}
+
+template <class ELFT>
+Expected<StringRef>
+ELFFile<ELFT>::getStringTableForSymtab(const Elf_Shdr &Sec,
+ Elf_Shdr_Range Sections) const {
+
+ if (Sec.sh_type != ELF::SHT_SYMTAB && Sec.sh_type != ELF::SHT_DYNSYM)
+ return createError(
+ "invalid sh_type for symbol table, expected SHT_SYMTAB or SHT_DYNSYM");
+ auto SectionOrErr = object::getSection<ELFT>(Sections, Sec.sh_link);
+ if (!SectionOrErr)
+ return SectionOrErr.takeError();
+ return getStringTable(*SectionOrErr);
+}
+
+template <class ELFT>
+Expected<StringRef>
+ELFFile<ELFT>::getSectionName(const Elf_Shdr *Section) const {
+ auto SectionsOrErr = sections();
+ if (!SectionsOrErr)
+ return SectionsOrErr.takeError();
+ auto Table = getSectionStringTable(*SectionsOrErr);
+ if (!Table)
+ return Table.takeError();
+ return getSectionName(Section, *Table);
+}
+
+template <class ELFT>
+Expected<StringRef> ELFFile<ELFT>::getSectionName(const Elf_Shdr *Section,
+ StringRef DotShstrtab) const {
+ uint32_t Offset = Section->sh_name;
+ if (Offset == 0)
+ return StringRef();
+ if (Offset >= DotShstrtab.size())
+ return createError("invalid string offset");
+ return StringRef(DotShstrtab.data() + Offset);
+}
+
/// This function returns the hash value for a symbol in the .dynsym section
/// Name of the API remains consistent as specified in the libelf
/// REF : http://www.sco.com/developers/gabi/latest/ch5.dynamic.html#hash
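The getSectionName() overloads added above resolve a section's name by treating sh_name as a byte offset into the section header string table. A minimal, self-contained sketch of that lookup, using plain std::string in place of the LLVM types (ShdrLite and the Malformed flag are illustrative only):

#include <cstdint>
#include <string>

// Hypothetical, simplified stand-in for Elf_Shdr.
struct ShdrLite { uint32_t sh_name; };

// Offset 0 means "no name"; an offset at or past the end of the string table
// is an error; otherwise the name is the NUL-terminated string starting there.
std::string lookupSectionName(const ShdrLite &Shdr, const std::string &ShStrTab,
                              bool &Malformed) {
  Malformed = false;
  if (Shdr.sh_name == 0)
    return "";
  if (Shdr.sh_name >= ShStrTab.size()) {
    Malformed = true;            // mirrors createError("invalid string offset")
    return "";
  }
  return std::string(ShStrTab.c_str() + Shdr.sh_name);
}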
diff --git a/include/llvm/ObjectYAML/COFFYAML.h b/include/llvm/ObjectYAML/COFFYAML.h
index bbceefac3d9..1fce46c125f 100644
--- a/include/llvm/ObjectYAML/COFFYAML.h
+++ b/include/llvm/ObjectYAML/COFFYAML.h
@@ -158,6 +158,16 @@ struct ScalarEnumerationTraits<COFF::RelocationTypeAMD64> {
};
template <>
+struct ScalarEnumerationTraits<COFF::RelocationTypesARM> {
+ static void enumeration(IO &IO, COFF::RelocationTypesARM &Value);
+};
+
+template <>
+struct ScalarEnumerationTraits<COFF::RelocationTypesARM64> {
+ static void enumeration(IO &IO, COFF::RelocationTypesARM64 &Value);
+};
+
+template <>
struct ScalarEnumerationTraits<COFF::WindowsSubsystem> {
static void enumeration(IO &IO, COFF::WindowsSubsystem &Value);
};
diff --git a/include/llvm/Support/GCOV.h b/include/llvm/ProfileData/GCOV.h
index 02016e7dbd6..497f80b87b2 100644
--- a/include/llvm/Support/GCOV.h
+++ b/include/llvm/ProfileData/GCOV.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_SUPPORT_GCOV_H
-#define LLVM_SUPPORT_GCOV_H
+#ifndef LLVM_PROFILEDATA_GCOV_H
+#define LLVM_PROFILEDATA_GCOV_H
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
diff --git a/include/llvm/ProfileData/SampleProfReader.h b/include/llvm/ProfileData/SampleProfReader.h
index 9c1f357cbbd..0e9ab2dc60e 100644
--- a/include/llvm/ProfileData/SampleProfReader.h
+++ b/include/llvm/ProfileData/SampleProfReader.h
@@ -217,10 +217,10 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/ProfileSummary.h"
+#include "llvm/ProfileData/GCOV.h"
#include "llvm/ProfileData/SampleProf.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/GCOV.h"
#include "llvm/Support/MemoryBuffer.h"
#include <algorithm>
#include <cstdint>
diff --git a/include/llvm/Support/CMakeLists.txt b/include/llvm/Support/CMakeLists.txt
index 6104382c3e4..bf662c77351 100644
--- a/include/llvm/Support/CMakeLists.txt
+++ b/include/llvm/Support/CMakeLists.txt
@@ -40,3 +40,4 @@ set_source_files_properties("${version_inc}"
HEADER_FILE_ONLY TRUE)
add_custom_target(llvm_vcsrevision_h DEPENDS "${version_inc}")
+set_target_properties(llvm_vcsrevision_h PROPERTIES FOLDER "Misc")
diff --git a/include/llvm/Support/FileOutputBuffer.h b/include/llvm/Support/FileOutputBuffer.h
index 8db64098c36..6aed423a01e 100644
--- a/include/llvm/Support/FileOutputBuffer.h
+++ b/include/llvm/Support/FileOutputBuffer.h
@@ -17,7 +17,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
namespace llvm {
@@ -37,7 +37,7 @@ public:
/// Factory method to create an OutputBuffer object which manages a read/write
/// buffer of the specified size. When committed, the buffer will be written
/// to the file at the specified path.
- static ErrorOr<std::unique_ptr<FileOutputBuffer>>
+ static Expected<std::unique_ptr<FileOutputBuffer>>
create(StringRef FilePath, size_t Size, unsigned Flags = 0);
/// Returns a pointer to the start of the buffer.
@@ -57,7 +57,7 @@ public:
/// is called, the file is deleted in the destructor. The optional parameter
/// is used if it turns out you want the file size to be smaller than
/// initially requested.
- virtual std::error_code commit() = 0;
+ virtual Error commit() = 0;
/// If this object was previously committed, the destructor just deletes
/// this object. If this object was not committed, the destructor
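Since create() now returns Expected<...> and commit() returns Error, callers must consume the error payload explicitly instead of testing a std::error_code. A hedged caller sketch under the new interface (the error-reporting style is illustrative, not taken from this patch):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileOutputBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <memory>
using namespace llvm;

static bool writeBlob(StringRef Path, ArrayRef<uint8_t> Data) {
  Expected<std::unique_ptr<FileOutputBuffer>> BufOrErr =
      FileOutputBuffer::create(Path, Data.size());
  if (!BufOrErr) {
    // The Expected now carries an llvm::Error, which must be consumed.
    logAllUnhandledErrors(BufOrErr.takeError(), errs(), "FileOutputBuffer: ");
    return false;
  }
  std::unique_ptr<FileOutputBuffer> &Buf = *BufOrErr;
  std::copy(Data.begin(), Data.end(), Buf->getBufferStart());
  if (Error E = Buf->commit()) {   // commit() now returns llvm::Error
    logAllUnhandledErrors(std::move(E), errs(), "commit: ");
    return false;
  }
  return true;
}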
diff --git a/include/llvm/Support/LowLevelTypeImpl.h b/include/llvm/Support/LowLevelTypeImpl.h
index c79dd0c2950..099fa461899 100644
--- a/include/llvm/Support/LowLevelTypeImpl.h
+++ b/include/llvm/Support/LowLevelTypeImpl.h
@@ -137,51 +137,6 @@ public:
return scalar(getScalarSizeInBits());
}
- /// Get a low-level type with half the size of the original, by halving the
- /// size of the scalar type involved. For example `s32` will become `s16`,
- /// `<2 x s32>` will become `<2 x s16>`.
- LLT halfScalarSize() const {
- assert(!IsPointer && getScalarSizeInBits() > 1 &&
- getScalarSizeInBits() % 2 == 0 && "cannot half size of this type");
- return LLT{/*isPointer=*/false, IsVector ? true : false,
- IsVector ? getNumElements() : (uint16_t)0,
- getScalarSizeInBits() / 2, /*AddressSpace=*/0};
- }
-
- /// Get a low-level type with twice the size of the original, by doubling the
- /// size of the scalar type involved. For example `s32` will become `s64`,
- /// `<2 x s32>` will become `<2 x s64>`.
- LLT doubleScalarSize() const {
- assert(!IsPointer && "cannot change size of this type");
- return LLT{/*isPointer=*/false, IsVector ? true : false,
- IsVector ? getNumElements() : (uint16_t)0,
- getScalarSizeInBits() * 2, /*AddressSpace=*/0};
- }
-
- /// Get a low-level type with half the size of the original, by halving the
- /// number of vector elements of the scalar type involved. The source must be
- /// a vector type with an even number of elements. For example `<4 x s32>`
- /// will become `<2 x s32>`, `<2 x s32>` will become `s32`.
- LLT halfElements() const {
- assert(isVector() && getNumElements() % 2 == 0 && "cannot half odd vector");
- if (getNumElements() == 2)
- return scalar(getScalarSizeInBits());
-
- return LLT{/*isPointer=*/false, /*isVector=*/true,
- (uint16_t)(getNumElements() / 2), getScalarSizeInBits(),
- /*AddressSpace=*/0};
- }
-
- /// Get a low-level type with twice the size of the original, by doubling the
- /// number of vector elements of the scalar type involved. The source must be
- /// a vector type. For example `<2 x s32>` will become `<4 x s32>`. Doubling
- /// the number of elements in sN produces <2 x sN>.
- LLT doubleElements() const {
- return LLT{IsPointer ? true : false, /*isVector=*/true,
- (uint16_t)(getNumElements() * 2), getScalarSizeInBits(),
- IsPointer ? getAddressSpace() : 0};
- }
-
void print(raw_ostream &OS) const;
bool operator==(const LLT &RHS) const {
diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h
index 73f0251a6b6..59c93f15d7b 100644
--- a/include/llvm/Support/MemoryBuffer.h
+++ b/include/llvm/Support/MemoryBuffer.h
@@ -136,7 +136,8 @@ public:
/// Map a subrange of the specified file as a MemoryBuffer.
static ErrorOr<std::unique_ptr<MemoryBuffer>>
- getFileSlice(const Twine &Filename, uint64_t MapSize, uint64_t Offset, bool IsVolatile = false);
+ getFileSlice(const Twine &Filename, uint64_t MapSize, uint64_t Offset,
+ bool IsVolatile = false);
//===--------------------------------------------------------------------===//
// Provided for performance analysis.
diff --git a/include/llvm/Support/SpecialCaseList.h b/include/llvm/Support/SpecialCaseList.h
index f76ca305efb..fd62fc48047 100644
--- a/include/llvm/Support/SpecialCaseList.h
+++ b/include/llvm/Support/SpecialCaseList.h
@@ -89,6 +89,17 @@ public:
bool inSection(StringRef Section, StringRef Prefix, StringRef Query,
StringRef Category = StringRef()) const;
+ /// Returns the line number corresponding to the special case list entry if
+ /// the special case list contains a line
+ /// \code
+ /// @Prefix:<E>=@Category
+ /// \endcode
+ /// where @Query satisfies wildcard expression <E> in a given @Section.
+ /// Returns zero if there is no blacklist entry corresponding to this
+ /// expression.
+ unsigned inSectionBlame(StringRef Section, StringRef Prefix, StringRef Query,
+ StringRef Category = StringRef()) const;
+
protected:
// Implementations of the create*() functions that can also be used by derived
// classes.
@@ -96,25 +107,25 @@ protected:
std::string &Error);
bool createInternal(const MemoryBuffer *MB, std::string &Error);
+ SpecialCaseList() = default;
SpecialCaseList(SpecialCaseList const &) = delete;
SpecialCaseList &operator=(SpecialCaseList const &) = delete;
/// Represents a set of regular expressions. Regular expressions which are
- /// "literal" (i.e. no regex metacharacters) are stored in Strings, while all
- /// others are represented as a single pipe-separated regex in RegEx. The
- /// reason for doing so is efficiency; StringSet is much faster at matching
+ /// "literal" (i.e. no regex metacharacters) are stored in Strings. The
+ /// reason for doing so is efficiency; StringMap is much faster at matching
/// literal strings than Regex.
class Matcher {
public:
- bool insert(std::string Regexp, std::string &REError);
- void compile();
- bool match(StringRef Query) const;
+ bool insert(std::string Regexp, unsigned LineNumber, std::string &REError);
+ // Returns the line number in the source file that this query matches to.
+ // Returns zero if no match is found.
+ unsigned match(StringRef Query) const;
private:
- StringSet<> Strings;
+ StringMap<unsigned> Strings;
TrigramIndex Trigrams;
- std::unique_ptr<Regex> RegEx;
- std::string UncompiledRegEx;
+ std::vector<std::pair<std::unique_ptr<Regex>, unsigned>> RegExes;
};
using SectionEntries = StringMap<StringMap<Matcher>>;
@@ -127,19 +138,15 @@ protected:
};
std::vector<Section> Sections;
- bool IsCompiled;
- SpecialCaseList();
/// Parses just-constructed SpecialCaseList entries from a memory buffer.
bool parse(const MemoryBuffer *MB, StringMap<size_t> &SectionsMap,
std::string &Error);
- /// compile() should be called once, after parsing all the memory buffers.
- void compile();
// Helper method for derived classes to search by Prefix, Query, and Category
// once they have already resolved a section entry.
- bool inSection(const SectionEntries &Entries, StringRef Prefix,
- StringRef Query, StringRef Category) const;
+ unsigned inSectionBlame(const SectionEntries &Entries, StringRef Prefix,
+ StringRef Query, StringRef Category) const;
};
} // namespace llvm
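Each Matcher entry now carries the line number it came from, so inSectionBlame() can report which list line matched (and 0 when nothing matched). A tiny self-contained sketch of that "match returns a line number" idea, using std::map for literal patterns instead of the real StringMap/Regex machinery:

#include <map>
#include <string>

// Simplified stand-in for Matcher: literal entries only, each mapped to the
// line number it was defined on. match() returns that line, or 0 on a miss.
struct LiteralMatcher {
  std::map<std::string, unsigned> Strings;    // pattern -> source line

  bool insert(const std::string &Pattern, unsigned LineNumber) {
    return Strings.emplace(Pattern, LineNumber).second;
  }

  unsigned match(const std::string &Query) const {
    auto It = Strings.find(Query);
    return It == Strings.end() ? 0 : It->second;
  }
};

// LiteralMatcher M; M.insert("foo", 3);
// M.match("foo") == 3, M.match("bar") == 0.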
diff --git a/include/llvm/Support/TargetParser.h b/include/llvm/Support/TargetParser.h
index 6b56a635ff0..b3f91433bd9 100644
--- a/include/llvm/Support/TargetParser.h
+++ b/include/llvm/Support/TargetParser.h
@@ -167,10 +167,10 @@ enum ArchExtKind : unsigned {
AEK_PROFILE = 1 << 6,
AEK_RAS = 1 << 7,
AEK_LSE = 1 << 8,
- AEK_RDM = 1 << 9,
- AEK_SVE = 1 << 10,
- AEK_DOTPROD = 1 << 11,
- AEK_RCPC = 1 << 12
+ AEK_SVE = 1 << 9,
+ AEK_DOTPROD = 1 << 10,
+ AEK_RCPC = 1 << 11,
+ AEK_RDM = 1 << 12
};
StringRef getCanonicalArchName(StringRef Arch);
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index 048bd1f2a0c..7dc2aec324e 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -884,12 +884,16 @@ class InstrInfo {
// Standard Pseudo Instructions.
// This list must match TargetOpcodes.h and CodeGenTarget.cpp.
// Only these instructions are allowed in the TargetOpcode namespace.
-let isCodeGenOnly = 1, isPseudo = 1, hasNoSchedulingInfo = 1,
- Namespace = "TargetOpcode" in {
+// Ensure mayLoad and mayStore have a default value, so as not to break
+// targets that set guessInstructionProperties=0. Any local definition of
+// mayLoad/mayStore takes precedence over these default values.
+let mayLoad = 0, mayStore = 0, isCodeGenOnly = 1, isPseudo = 1,
+ hasNoSchedulingInfo = 1, Namespace = "TargetOpcode" in {
def PHI : Instruction {
let OutOperandList = (outs unknown:$dst);
let InOperandList = (ins variable_ops);
let AsmString = "PHINODE";
+ let hasSideEffects = 1;
}
def INLINEASM : Instruction {
let OutOperandList = (outs);
@@ -902,13 +906,15 @@ def CFI_INSTRUCTION : Instruction {
let InOperandList = (ins i32imm:$id);
let AsmString = "";
let hasCtrlDep = 1;
- let isNotDuplicable = 1;
+ let hasSideEffects = 1;
+ let isNotDuplicable = 0;
}
def EH_LABEL : Instruction {
let OutOperandList = (outs);
let InOperandList = (ins i32imm:$id);
let AsmString = "";
let hasCtrlDep = 1;
+ let hasSideEffects = 1;
let isNotDuplicable = 1;
}
def GC_LABEL : Instruction {
@@ -916,6 +922,7 @@ def GC_LABEL : Instruction {
let InOperandList = (ins i32imm:$id);
let AsmString = "";
let hasCtrlDep = 1;
+ let hasSideEffects = 1;
let isNotDuplicable = 1;
}
def ANNOTATION_LABEL : Instruction {
@@ -923,6 +930,7 @@ def ANNOTATION_LABEL : Instruction {
let InOperandList = (ins i32imm:$id);
let AsmString = "";
let hasCtrlDep = 1;
+ let hasSideEffects = 1;
let isNotDuplicable = 1;
}
def KILL : Instruction {
@@ -990,6 +998,7 @@ def BUNDLE : Instruction {
let OutOperandList = (outs);
let InOperandList = (ins variable_ops);
let AsmString = "BUNDLE";
+ let hasSideEffects = 1;
}
def LIFETIME_START : Instruction {
let OutOperandList = (outs);
@@ -1006,6 +1015,7 @@ def LIFETIME_END : Instruction {
def STACKMAP : Instruction {
let OutOperandList = (outs);
let InOperandList = (ins i64imm:$id, i32imm:$nbytes, variable_ops);
+ let hasSideEffects = 1;
let isCall = 1;
let mayLoad = 1;
let usesCustomInserter = 1;
@@ -1014,6 +1024,7 @@ def PATCHPOINT : Instruction {
let OutOperandList = (outs unknown:$dst);
let InOperandList = (ins i64imm:$id, i32imm:$nbytes, unknown:$callee,
i32imm:$nargs, i32imm:$cc, variable_ops);
+ let hasSideEffects = 1;
let isCall = 1;
let mayLoad = 1;
let usesCustomInserter = 1;
@@ -1048,6 +1059,7 @@ def FAULTING_OP : Instruction {
let OutOperandList = (outs unknown:$dst);
let InOperandList = (ins variable_ops);
let usesCustomInserter = 1;
+ let hasSideEffects = 1;
let mayLoad = 1;
let mayStore = 1;
let isTerminator = 1;
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index c1d0b32f7d7..994480ebc90 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -2678,6 +2678,15 @@ public:
const SelectionDAG &DAG,
unsigned Depth = 0) const;
+ /// Determine which of the bits of FrameIndex \p FIOp are known to be 0.
+ /// Default implementation computes low bits based on alignment
+ /// information. This should preserve known bits passed into it.
+ virtual void computeKnownBitsForFrameIndex(const SDValue FIOp,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const;
+
/// This method can be implemented by targets that want to expose additional
/// information about sign bits to the DAG Combiner. The DemandedElts
/// argument allows us to only collect the minimum sign bits that are shared
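The default implementation of computeKnownBitsForFrameIndex is documented as deriving known-zero low bits from the frame object's alignment. As a worked illustration of that reasoning (standalone arithmetic, not the TargetLowering code itself): an object aligned to 16 bytes has its low log2(16) = 4 address bits known to be zero.

#include <cassert>
#include <cstdint>

// For a power-of-two alignment, the object's address has log2(Align) trailing
// zero bits, so the mask of known-zero low bits is simply Align - 1.
uint64_t knownZeroLowBitMask(uint64_t Align) {
  assert(Align != 0 && (Align & (Align - 1)) == 0 && "expect a power of two");
  return Align - 1;
}

// knownZeroLowBitMask(16) == 0xF: the low four bits of the address are zero.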
diff --git a/include/llvm/Transforms/PGOInstrumentation.h b/include/llvm/Transforms/PGOInstrumentation.h
index fa7a68624ec..c2cc76c422d 100644
--- a/include/llvm/Transforms/PGOInstrumentation.h
+++ b/include/llvm/Transforms/PGOInstrumentation.h
@@ -68,6 +68,8 @@ public:
void setProfMetadata(Module *M, Instruction *TI, ArrayRef<uint64_t> EdgeCounts,
uint64_t MaxCount);
+void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count);
+
} // end namespace llvm
#endif // LLVM_TRANSFORMS_PGOINSTRUMENTATION_H
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index 8ef65774a93..0cf1115dc97 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -73,6 +73,14 @@ FunctionPass *createDeadCodeEliminationPass();
//
FunctionPass *createDeadStoreEliminationPass();
+
+//===----------------------------------------------------------------------===//
+//
+// CallSiteSplitting - This pass splits call sites based on their known
+// argument values.
+FunctionPass *createCallSiteSplittingPass();
+
+
//===----------------------------------------------------------------------===//
//
// AggressiveDCE - This pass uses the SSA based Aggressive DCE algorithm. This
@@ -422,7 +430,7 @@ Pass *createLowerGuardIntrinsicPass();
//===----------------------------------------------------------------------===//
//
-// MergeICmps - Merge integer comparison chains
+// MergeICmps - Merge integer comparison chains into a memcmp
//
Pass *createMergeICmpsPass();
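The sharpened comment says MergeICmps folds chains of integer comparisons into a memcmp. A small source-level picture of the kind of chain it targets, assuming adjacent, padding-free fields (an illustration of the idea, not output of the pass):

#include <cstring>

struct Point { int x, y, z; };

// A chain of equality comparisons over adjacent fields...
bool eqChain(const Point &A, const Point &B) {
  return A.x == B.x && A.y == B.y && A.z == B.z;
}

// ...is equivalent to one block compare when the fields are contiguous with
// no padding, which is the shape MergeICmps tries to produce.
bool eqMemcmp(const Point &A, const Point &B) {
  return std::memcmp(&A, &B, sizeof(Point)) == 0;
}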
diff --git a/include/llvm/Transforms/Scalar/CallSiteSplitting.h b/include/llvm/Transforms/Scalar/CallSiteSplitting.h
new file mode 100644
index 00000000000..5ab951a49f2
--- /dev/null
+++ b/include/llvm/Transforms/Scalar/CallSiteSplitting.h
@@ -0,0 +1,29 @@
+//===- CallSiteSplitting.h - Callsite Splitting -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H
+#define LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Support/Compiler.h"
+#include <vector>
+
+namespace llvm {
+
+struct CallSiteSplittingPass : PassInfoMixin<CallSiteSplittingPass> {
+ /// \brief Run the pass over the function.
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H
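A rough source-level picture of what the new pass does: when a call argument is known differently along each incoming path, the call is duplicated into those paths so each copy sees the constrained argument. The sketch below is conceptual only; callee is a placeholder, and the pass itself works on IR, not C++ source.

// Before: one call site; the argument's value is only known per incoming path.
static int callee(int V) { return V + 1; }   // placeholder

int before(bool Cond, int X) {
  int V = Cond ? 0 : X;
  return callee(V);
}

// After (conceptually): the call site is split so that each copy of the call
// has the better-known argument, enabling later per-call-site optimization.
int after(bool Cond, int X) {
  if (Cond)
    return callee(0);     // argument known to be 0 on this path
  return callee(X);
}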
diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h
index 650224610ad..a59b188f8d6 100644
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@@ -331,15 +331,13 @@ public:
/// not have the "fast-math" property. Such operation requires a relaxed FP
/// mode.
bool hasUnsafeAlgebra() {
- return InductionBinOp &&
- !cast<FPMathOperator>(InductionBinOp)->hasUnsafeAlgebra();
+ return InductionBinOp && !cast<FPMathOperator>(InductionBinOp)->isFast();
}
/// Returns induction operator that does not have "fast-math" property
/// and requires FP unsafe mode.
Instruction *getUnsafeAlgebraInst() {
- if (!InductionBinOp ||
- cast<FPMathOperator>(InductionBinOp)->hasUnsafeAlgebra())
+ if (!InductionBinOp || cast<FPMathOperator>(InductionBinOp)->isFast())
return nullptr;
return InductionBinOp;
}
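hasUnsafeAlgebra() is replaced here by isFast(), which is only true when every individual fast-math flag is present. A tiny bitmask sketch of that "all flags set" test (the flag names and layout are illustrative, not the real FastMathFlags encoding):

#include <cstdint>

enum FMFBit : uint8_t {
  Reassoc = 1 << 0, NoNaNs = 1 << 1, NoInfs = 1 << 2, NoSignedZeros = 1 << 3,
  AllowReciprocal = 1 << 4, AllowContract = 1 << 5, ApproxFunc = 1 << 6,
};

constexpr uint8_t AllBits = Reassoc | NoNaNs | NoInfs | NoSignedZeros |
                            AllowReciprocal | AllowContract | ApproxFunc;

// isFast-style check: "fast" means the whole set, not a single umbrella bit.
bool isFastLike(uint8_t Flags) { return (Flags & AllBits) == AllBits; }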
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 4a6abae8e97..fb9ece2bd20 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -1672,9 +1672,9 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, uint64_t V1Size,
// If both pointers are pointing into the same object and one of them
// accesses the entire object, then the accesses must overlap in some way.
if (O1 == O2)
- if ((V1Size != MemoryLocation::UnknownSize &&
- isObjectSize(O1, V1Size, DL, TLI)) ||
- (V2Size != MemoryLocation::UnknownSize &&
+ if (V1Size != MemoryLocation::UnknownSize &&
+ V2Size != MemoryLocation::UnknownSize &&
+ (isObjectSize(O1, V1Size, DL, TLI) ||
isObjectSize(O2, V2Size, DL, TLI)))
return AliasCache[Locs] = PartialAlias;
diff --git a/lib/Analysis/BlockFrequencyInfo.cpp b/lib/Analysis/BlockFrequencyInfo.cpp
index 5d2170dcf15..41c29589521 100644
--- a/lib/Analysis/BlockFrequencyInfo.cpp
+++ b/lib/Analysis/BlockFrequencyInfo.cpp
@@ -218,6 +218,11 @@ BlockFrequencyInfo::getProfileCountFromFreq(uint64_t Freq) const {
return BFI->getProfileCountFromFreq(*getFunction(), Freq);
}
+bool BlockFrequencyInfo::isIrrLoopHeader(const BasicBlock *BB) {
+ assert(BFI && "Expected analysis to be available");
+ return BFI->isIrrLoopHeader(BB);
+}
+
void BlockFrequencyInfo::setBlockFreq(const BasicBlock *BB, uint64_t Freq) {
assert(BFI && "Expected analysis to be available");
BFI->setBlockFreq(BB, Freq);
diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 1030407b766..7e323022d9c 100644
--- a/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -271,6 +271,7 @@ void BlockFrequencyInfoImplBase::clear() {
// Swap with a default-constructed std::vector, since std::vector<>::clear()
// does not actually clear heap storage.
std::vector<FrequencyData>().swap(Freqs);
+ IsIrrLoopHeader.clear();
std::vector<WorkingData>().swap(Working);
Loops.clear();
}
@@ -280,8 +281,10 @@ void BlockFrequencyInfoImplBase::clear() {
/// Releases all memory not used downstream. In particular, saves Freqs.
static void cleanup(BlockFrequencyInfoImplBase &BFI) {
std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs));
+ SparseBitVector<> SavedIsIrrLoopHeader(std::move(BFI.IsIrrLoopHeader));
BFI.clear();
BFI.Freqs = std::move(SavedFreqs);
+ BFI.IsIrrLoopHeader = std::move(SavedIsIrrLoopHeader);
}
bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist,
@@ -572,6 +575,13 @@ BlockFrequencyInfoImplBase::getProfileCountFromFreq(const Function &F,
return BlockCount.getLimitedValue();
}
+bool
+BlockFrequencyInfoImplBase::isIrrLoopHeader(const BlockNode &Node) {
+ if (!Node.isValid())
+ return false;
+ return IsIrrLoopHeader.test(Node.Index);
+}
+
Scaled64
BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const {
if (!Node.isValid())
@@ -819,3 +829,14 @@ void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) {
DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
}
}
+
+void BlockFrequencyInfoImplBase::distributeIrrLoopHeaderMass(Distribution &Dist) {
+ BlockMass LoopMass = BlockMass::getFull();
+ DitheringDistributer D(Dist, LoopMass);
+ for (const Weight &W : Dist.Weights) {
+ BlockMass Taken = D.takeMass(W.Amount);
+ assert(W.Type == Weight::Local && "all weights should be local");
+ Working[W.TargetNode.Index].getMass() = Taken;
+ DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
+ }
+}
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index 19889658b13..e141d6c58b6 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2136,8 +2136,51 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
if (!Stride)
return;
- DEBUG(dbgs() << "LAA: Found a strided access that we can version");
+ DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
+ "versioning:");
DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
+
+ // Avoid adding the "Stride == 1" predicate when we know that
+ // Stride >= Trip-Count. Such a predicate will effectively optimize a single
+ // or zero iteration loop, as Trip-Count <= Stride == 1.
+ //
+ // TODO: We are currently not making a very informed decision on when it is
+ // beneficial to apply stride versioning. It might make more sense that the
+ // users of this analysis (such as the vectorizer) will trigger it, based on
+ // their specific cost considerations; For example, in cases where stride
+ // versioning does not help resolving memory accesses/dependences, the
+ // vectorizer should evaluate the cost of the runtime test, and the benefit
+ // of various possible stride specializations, considering the alternatives
+ // of using gather/scatters (if available).
+
+ const SCEV *StrideExpr = PSE->getSCEV(Stride);
+ const SCEV *BETakenCount = PSE->getBackedgeTakenCount();
+
+ // Match the types so we can compare the stride and the BETakenCount.
+ // The Stride can be positive/negative, so we sign extend Stride;
+  // The BackedgeTakenCount is non-negative, so we zero extend BETakenCount.
+ const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
+ uint64_t StrideTypeSize = DL.getTypeAllocSize(StrideExpr->getType());
+ uint64_t BETypeSize = DL.getTypeAllocSize(BETakenCount->getType());
+ const SCEV *CastedStride = StrideExpr;
+ const SCEV *CastedBECount = BETakenCount;
+ ScalarEvolution *SE = PSE->getSE();
+ if (BETypeSize >= StrideTypeSize)
+ CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
+ else
+ CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
+ const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
+ // Since TripCount == BackEdgeTakenCount + 1, checking:
+ // "Stride >= TripCount" is equivalent to checking:
+ // Stride - BETakenCount > 0
+ if (SE->isKnownPositive(StrideMinusBETaken)) {
+ DEBUG(dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
+ "Stride==1 predicate will imply that the loop executes "
+ "at most once.\n");
+ return;
+ }
+ DEBUG(dbgs() << "LAA: Found a strided access that we can version.");
+
SymbolicStrides[Ptr] = Stride;
StrideSet.insert(Stride);
}
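The new bail-out reasons that when Stride >= TripCount, speculating Stride == 1 would only describe a loop running at most once. A small arithmetic sketch of the exact inequality the SCEV query checks (plain integers in place of SCEV expressions):

#include <cstdint>

// TripCount == BackedgeTakenCount + 1, so "Stride >= TripCount" is the same
// as "Stride - BackedgeTakenCount > 0", which is what isKnownPositive() asks
// of StrideMinusBETaken above.
bool strideVersioningPointless(int64_t Stride, int64_t BackedgeTakenCount) {
  return Stride - BackedgeTakenCount > 0;
}

// Example: Stride = 100, BackedgeTakenCount = 2 (trip count 3) -> true, so a
// "Stride == 1" runtime predicate would not be worth adding.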
diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp
index afd575e7273..82db09ca97b 100644
--- a/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -303,7 +303,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
// FIXME: refactor this to use the same code that inliner is using.
F.isVarArg();
GlobalValueSummary::GVFlags Flags(F.getLinkage(), NotEligibleForImport,
- /* Live = */ false);
+ /* Live = */ false, F.isDSOLocal());
FunctionSummary::FFlags FunFlags{
F.hasFnAttribute(Attribute::ReadNone),
F.hasFnAttribute(Attribute::ReadOnly),
@@ -329,7 +329,7 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V,
findRefEdges(Index, &V, RefEdges, Visited);
bool NonRenamableLocal = isNonRenamableLocal(V);
GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal,
- /* Live = */ false);
+ /* Live = */ false, V.isDSOLocal());
auto GVarSummary =
llvm::make_unique<GlobalVarSummary>(Flags, RefEdges.takeVector());
if (NonRenamableLocal)
@@ -342,7 +342,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A,
DenseSet<GlobalValue::GUID> &CantBePromoted) {
bool NonRenamableLocal = isNonRenamableLocal(A);
GlobalValueSummary::GVFlags Flags(A.getLinkage(), NonRenamableLocal,
- /* Live = */ false);
+ /* Live = */ false, A.isDSOLocal());
auto AS = llvm::make_unique<AliasSummary>(Flags);
auto *Aliasee = A.getBaseObject();
auto *AliaseeSummary = Index.getGlobalValueSummary(*Aliasee);
@@ -410,7 +410,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
assert(GV->isDeclaration() && "Def in module asm already has definition");
GlobalValueSummary::GVFlags GVFlags(GlobalValue::InternalLinkage,
/* NotEligibleToImport = */ true,
- /* Live = */ true);
+ /* Live = */ true,
+ /* Local */ GV->isDSOLocal());
CantBePromoted.insert(GlobalValue::getGUID(Name));
// Create the appropriate summary type.
if (Function *F = dyn_cast<Function>(GV)) {
diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp
index 3a3a7ad3955..1e36e314b86 100644
--- a/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -314,17 +314,8 @@ AliasResult TypeBasedAAResult::alias(const MemoryLocation &LocA,
if (!EnableTBAA)
return AAResultBase::alias(LocA, LocB);
- // Get the attached MDNodes. If either value lacks a tbaa MDNode, we must
- // be conservative.
- const MDNode *AM = LocA.AATags.TBAA;
- if (!AM)
- return AAResultBase::alias(LocA, LocB);
- const MDNode *BM = LocB.AATags.TBAA;
- if (!BM)
- return AAResultBase::alias(LocA, LocB);
-
- // If they may alias, chain to the next AliasAnalysis.
- if (Aliases(AM, BM))
+ // If accesses may alias, chain to the next AliasAnalysis.
+ if (Aliases(LocA.AATags.TBAA, LocB.AATags.TBAA))
return AAResultBase::alias(LocA, LocB);
// Otherwise return a definitive result.
@@ -424,25 +415,24 @@ bool MDNode::isTBAAVtableAccess() const {
return false;
}
+static bool matchAccessTags(const MDNode *A, const MDNode *B,
+ const MDNode **GenericTag = nullptr);
+
MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
+ const MDNode *GenericTag;
+ matchAccessTags(A, B, &GenericTag);
+ return const_cast<MDNode*>(GenericTag);
+}
+
+static const MDNode *getLeastCommonType(const MDNode *A, const MDNode *B) {
if (!A || !B)
return nullptr;
if (A == B)
return A;
- // For struct-path aware TBAA, we use the access type of the tag.
- assert(isStructPathTBAA(A) && isStructPathTBAA(B) &&
- "Auto upgrade should have taken care of this!");
- A = cast_or_null<MDNode>(MutableTBAAStructTagNode(A).getAccessType());
- if (!A)
- return nullptr;
- B = cast_or_null<MDNode>(MutableTBAAStructTagNode(B).getAccessType());
- if (!B)
- return nullptr;
-
- SmallSetVector<MDNode *, 4> PathA;
- MutableTBAANode TA(A);
+ SmallSetVector<const MDNode *, 4> PathA;
+ TBAANode TA(A);
while (TA.getNode()) {
if (PathA.count(TA.getNode()))
report_fatal_error("Cycle found in TBAA metadata.");
@@ -450,8 +440,8 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
TA = TA.getParent();
}
- SmallSetVector<MDNode *, 4> PathB;
- MutableTBAANode TB(B);
+ SmallSetVector<const MDNode *, 4> PathB;
+ TBAANode TB(B);
while (TB.getNode()) {
if (PathB.count(TB.getNode()))
report_fatal_error("Cycle found in TBAA metadata.");
@@ -462,7 +452,7 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
int IA = PathA.size() - 1;
int IB = PathB.size() - 1;
- MDNode *Ret = nullptr;
+ const MDNode *Ret = nullptr;
while (IA >= 0 && IB >= 0) {
if (PathA[IA] == PathB[IB])
Ret = PathA[IA];
@@ -472,17 +462,7 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
--IB;
}
- // We either did not find a match, or the only common base "type" is
- // the root node. In either case, we don't have any useful TBAA
- // metadata to attach.
- if (!Ret || Ret->getNumOperands() < 2)
- return nullptr;
-
- // We need to convert from a type node to a tag node.
- Type *Int64 = IntegerType::get(A->getContext(), 64);
- Metadata *Ops[3] = {Ret, Ret,
- ConstantAsMetadata::get(ConstantInt::get(Int64, 0))};
- return MDNode::get(A->getContext(), Ops);
+ return Ret;
}
void Instruction::getAAMetadata(AAMDNodes &N, bool Merge) const {
@@ -505,70 +485,96 @@ void Instruction::getAAMetadata(AAMDNodes &N, bool Merge) const {
N.NoAlias = getMetadata(LLVMContext::MD_noalias);
}
-/// Aliases - Test whether the type represented by A may alias the
-/// type represented by B.
-bool TypeBasedAAResult::Aliases(const MDNode *A, const MDNode *B) const {
- // Verify that both input nodes are struct-path aware. Auto-upgrade should
- // have taken care of this.
- assert(isStructPathTBAA(A) && "MDNode A is not struct-path aware.");
- assert(isStructPathTBAA(B) && "MDNode B is not struct-path aware.");
+static bool findAccessType(TBAAStructTagNode BaseTag,
+ const MDNode *AccessTypeNode,
+ uint64_t &OffsetInBase) {
+ // Start from the base type, follow the edge with the correct offset in
+ // the type DAG and adjust the offset until we reach the access type or
+ // until we reach a root node.
+ TBAAStructTypeNode BaseType(BaseTag.getBaseType());
+ OffsetInBase = BaseTag.getOffset();
+
+ while (const MDNode *BaseTypeNode = BaseType.getNode()) {
+ if (BaseTypeNode == AccessTypeNode)
+ return true;
- // Keep track of the root node for A and B.
- TBAAStructTypeNode RootA, RootB;
- TBAAStructTagNode TagA(A), TagB(B);
+ // Follow the edge with the correct offset, Offset will be adjusted to
+ // be relative to the field type.
+ BaseType = BaseType.getParent(OffsetInBase);
+ }
+ return false;
+}
- // TODO: We need to check if AccessType of TagA encloses AccessType of
- // TagB to support aggregate AccessType. If yes, return true.
+static const MDNode *createAccessTag(const MDNode *AccessType) {
+ // If there is no access type or the access type is the root node, then
+ // we don't have any useful access tag to return.
+ if (!AccessType || AccessType->getNumOperands() < 2)
+ return nullptr;
- // Start from the base type of A, follow the edge with the correct offset in
- // the type DAG and adjust the offset until we reach the base type of B or
- // until we reach the Root node.
- // Compare the adjusted offset once we have the same base.
+ Type *Int64 = IntegerType::get(AccessType->getContext(), 64);
+ auto *ImmutabilityFlag = ConstantAsMetadata::get(ConstantInt::get(Int64, 0));
+ Metadata *Ops[] = {const_cast<MDNode*>(AccessType),
+ const_cast<MDNode*>(AccessType), ImmutabilityFlag};
+ return MDNode::get(AccessType->getContext(), Ops);
+}
- // Climb the type DAG from base type of A to see if we reach base type of B.
- const MDNode *BaseA = TagA.getBaseType();
- const MDNode *BaseB = TagB.getBaseType();
- uint64_t OffsetA = TagA.getOffset(), OffsetB = TagB.getOffset();
- for (TBAAStructTypeNode T(BaseA);;) {
- if (T.getNode() == BaseB)
- // Base type of A encloses base type of B, check if the offsets match.
- return OffsetA == OffsetB;
-
- RootA = T;
- // Follow the edge with the correct offset, OffsetA will be adjusted to
- // be relative to the field type.
- T = T.getParent(OffsetA);
- if (!T.getNode())
- break;
+/// matchAccessTags - Return true if the given pair of accesses are allowed to
+/// overlap. If \arg GenericTag is not null, then on return it points to the
+/// most generic access descriptor for the given two.
+static bool matchAccessTags(const MDNode *A, const MDNode *B,
+ const MDNode **GenericTag) {
+ if (A == B) {
+ if (GenericTag)
+ *GenericTag = A;
+ return true;
}
- // Reset OffsetA and climb the type DAG from base type of B to see if we reach
- // base type of A.
- OffsetA = TagA.getOffset();
- for (TBAAStructTypeNode T(BaseB);;) {
- if (T.getNode() == BaseA)
- // Base type of B encloses base type of A, check if the offsets match.
- return OffsetA == OffsetB;
-
- RootB = T;
- // Follow the edge with the correct offset, OffsetB will be adjusted to
- // be relative to the field type.
- T = T.getParent(OffsetB);
- if (!T.getNode())
- break;
+ // Accesses with no TBAA information may alias with any other accesses.
+ if (!A || !B) {
+ if (GenericTag)
+ *GenericTag = nullptr;
+ return true;
}
- // Neither node is an ancestor of the other.
+ // Verify that both input nodes are struct-path aware. Auto-upgrade should
+ // have taken care of this.
+ assert(isStructPathTBAA(A) && "Access A is not struct-path aware!");
+ assert(isStructPathTBAA(B) && "Access B is not struct-path aware!");
+
+ TBAAStructTagNode TagA(A), TagB(B);
+ const MDNode *CommonType = getLeastCommonType(TagA.getAccessType(),
+ TagB.getAccessType());
+ if (GenericTag)
+ *GenericTag = createAccessTag(CommonType);
- // If they have different roots, they're part of different potentially
- // unrelated type systems, so we must be conservative.
- if (RootA.getNode() != RootB.getNode())
+ // TODO: We need to check if AccessType of TagA encloses AccessType of
+ // TagB to support aggregate AccessType. If yes, return true.
+
+ // Climb the type DAG from base type of A to see if we reach base type of B.
+ uint64_t OffsetA;
+ if (findAccessType(TagA, TagB.getBaseType(), OffsetA))
+ return OffsetA == TagB.getOffset();
+
+ // Climb the type DAG from base type of B to see if we reach base type of A.
+ uint64_t OffsetB;
+ if (findAccessType(TagB, TagA.getBaseType(), OffsetB))
+ return OffsetB == TagA.getOffset();
+
+ // If the final access types have different roots, they're part of different
+ // potentially unrelated type systems, so we must be conservative.
+ if (!CommonType)
return true;
// If they have the same root, then we've proved there's no alias.
return false;
}
+/// Aliases - Test whether the access represented by tag A may alias the
+/// access represented by tag B.
+bool TypeBasedAAResult::Aliases(const MDNode *A, const MDNode *B) const {
+ return matchAccessTags(A, B);
+}
+
AnalysisKey TypeBasedAA::Key;
TypeBasedAAResult TypeBasedAA::run(Function &F, FunctionAnalysisManager &AM) {
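findAccessType() walks from a tag's base type down the struct-path type DAG, repeatedly following the field that covers the current offset and rebasing the offset, until it reaches the access type or falls off a leaf. A much simplified, self-contained sketch of that walk over a toy type tree (ToyType and its field table are hypothetical, not LLVM metadata nodes):

#include <cstdint>
#include <vector>

// Toy struct-path node: each field records its start offset and field type.
// Scalar "leaf" types simply have no fields.
struct ToyType {
  struct Field { uint64_t Offset; const ToyType *Type; };
  std::vector<Field> Fields;   // kept sorted by Offset
};

bool reachesAccessType(const ToyType *Base, const ToyType *Target,
                       uint64_t Offset) {
  while (Base) {
    if (Base == Target)
      return true;
    // Pick the last field starting at or before Offset, then make Offset
    // relative to that field, mirroring BaseType.getParent(OffsetInBase).
    const ToyType::Field *Chosen = nullptr;
    for (const ToyType::Field &F : Base->Fields) {
      if (F.Offset > Offset)
        break;
      Chosen = &F;
    }
    if (!Chosen)
      return false;            // reached a leaf without finding Target
    Offset -= Chosen->Offset;
    Base = Chosen->Type;
  }
  return false;
}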
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 3f53fc517e3..2010858139a 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -2579,9 +2579,7 @@ Intrinsic::ID llvm::getIntrinsicForCallSite(ImmutableCallSite ICS,
case LibFunc_sqrt:
case LibFunc_sqrtf:
case LibFunc_sqrtl:
- if (ICS->hasNoNaNs())
- return Intrinsic::sqrt;
- return Intrinsic::not_intrinsic;
+ return Intrinsic::sqrt;
}
return Intrinsic::not_intrinsic;
@@ -4140,7 +4138,8 @@ static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
// Is the sign bit set?
// (X <s 0) ? X : MAXVAL ==> (X >u MAXVAL) ? X : MAXVAL ==> UMAX
// (X <s 0) ? MAXVAL : X ==> (X >u MAXVAL) ? MAXVAL : X ==> UMIN
- if (Pred == CmpInst::ICMP_SLT && *C1 == 0 && C2->isMaxSignedValue())
+ if (Pred == CmpInst::ICMP_SLT && C1->isNullValue() &&
+ C2->isMaxSignedValue())
return {CmpLHS == TrueVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false};
// Is the sign bit clear?
@@ -4272,13 +4271,15 @@ static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
// ABS(X) ==> (X >s 0) ? X : -X and (X >s -1) ? X : -X
// NABS(X) ==> (X >s 0) ? -X : X and (X >s -1) ? -X : X
- if (Pred == ICmpInst::ICMP_SGT && (*C1 == 0 || C1->isAllOnesValue())) {
+ if (Pred == ICmpInst::ICMP_SGT &&
+ (C1->isNullValue() || C1->isAllOnesValue())) {
return {(CmpLHS == TrueVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false};
}
// ABS(X) ==> (X <s 0) ? -X : X and (X <s 1) ? -X : X
// NABS(X) ==> (X <s 0) ? X : -X and (X <s 1) ? X : -X
- if (Pred == ICmpInst::ICMP_SLT && (*C1 == 0 || *C1 == 1)) {
+ if (Pred == ICmpInst::ICMP_SLT &&
+ (C1->isNullValue() || C1->isOneValue())) {
return {(CmpLHS == FalseVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false};
}
}
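The matchMinMax comment above claims that "(X <s 0) ? X : MAXVAL" is really an unsigned max when MAXVAL is the largest signed value. A small self-contained check of that equivalence on 32-bit values:

#include <algorithm>
#include <cstdint>

// If X is signed-negative, its unsigned value exceeds 0x7fffffff, so the
// unsigned max picks X; otherwise X <= 0x7fffffff as unsigned and the max
// picks the constant. Either way the result matches the signed select.
bool umaxFormHolds(int32_t X) {
  uint32_t Select = (X < 0) ? static_cast<uint32_t>(X) : UINT32_C(0x7fffffff);
  uint32_t UMax = std::max(static_cast<uint32_t>(X), UINT32_C(0x7fffffff));
  return Select == UMax;
}

// umaxFormHolds(X) is true for every int32_t X.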
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 50b391fdf73..b8b56d79c82 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -552,6 +552,8 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(nsz);
KEYWORD(arcp);
KEYWORD(contract);
+ KEYWORD(reassoc);
+ KEYWORD(afn);
KEYWORD(fast);
KEYWORD(nuw);
KEYWORD(nsw);
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index dcc3f22e03b..94e4c1ae96d 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -193,7 +193,7 @@ namespace llvm {
FastMathFlags FMF;
while (true)
switch (Lex.getKind()) {
- case lltok::kw_fast: FMF.setUnsafeAlgebra(); Lex.Lex(); continue;
+ case lltok::kw_fast: FMF.setFast(); Lex.Lex(); continue;
case lltok::kw_nnan: FMF.setNoNaNs(); Lex.Lex(); continue;
case lltok::kw_ninf: FMF.setNoInfs(); Lex.Lex(); continue;
case lltok::kw_nsz: FMF.setNoSignedZeros(); Lex.Lex(); continue;
@@ -202,6 +202,8 @@ namespace llvm {
FMF.setAllowContract(true);
Lex.Lex();
continue;
+ case lltok::kw_reassoc: FMF.setAllowReassoc(); Lex.Lex(); continue;
+ case lltok::kw_afn: FMF.setApproxFunc(); Lex.Lex(); continue;
default: return FMF;
}
return FMF;
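The parser now maps the new reassoc and afn keywords onto the matching FastMathFlags setters. A short sketch of setting the same flags through the C++ API (assuming FastMathFlags is still declared in llvm/IR/Operator.h):

#include "llvm/IR/Operator.h"
using namespace llvm;

FastMathFlags makeReassocAfn() {
  FastMathFlags FMF;
  FMF.setAllowReassoc();   // the 'reassoc' keyword
  FMF.setApproxFunc();     // the 'afn' keyword
  return FMF;              // not "fast": that keyword sets every flag via setFast()
}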
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index db0de6c0d5a..0c5cf6b5d45 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -102,6 +102,8 @@ enum Kind {
kw_nsz,
kw_arcp,
kw_contract,
+ kw_reassoc,
+ kw_afn,
kw_fast,
kw_nuw,
kw_nsw,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index c2272260f44..3e0a39c099b 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -889,7 +889,9 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags,
// to work correctly on earlier versions, we must conservatively treat all
// values as live.
bool Live = (RawFlags & 0x2) || Version < 3;
- return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live);
+ bool Local = (RawFlags & 0x4);
+
+ return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live, Local);
}
static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) {
@@ -1044,8 +1046,8 @@ static Comdat::SelectionKind getDecodedComdatSelectionKind(unsigned Val) {
static FastMathFlags getDecodedFastMathFlags(unsigned Val) {
FastMathFlags FMF;
- if (0 != (Val & FastMathFlags::UnsafeAlgebra))
- FMF.setUnsafeAlgebra();
+ if (0 != (Val & FastMathFlags::AllowReassoc))
+ FMF.setAllowReassoc();
if (0 != (Val & FastMathFlags::NoNaNs))
FMF.setNoNaNs();
if (0 != (Val & FastMathFlags::NoInfs))
@@ -1056,6 +1058,8 @@ static FastMathFlags getDecodedFastMathFlags(unsigned Val) {
FMF.setAllowReciprocal();
if (0 != (Val & FastMathFlags::AllowContract))
FMF.setAllowContract(true);
+ if (0 != (Val & FastMathFlags::ApproxFunc))
+ FMF.setApproxFunc();
return FMF;
}
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index 1e491aa066e..03a77c9734e 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -955,6 +955,8 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) {
RawFlags |= Flags.NotEligibleToImport; // bool
RawFlags |= (Flags.Live << 1);
+ RawFlags |= (Flags.DSOLocal << 2);
+
// Linkage don't need to be remapped at that time for the summary. Any future
// change to the getEncodedLinkage() function will need to be taken into
// account here as well.
@@ -1319,8 +1321,8 @@ static uint64_t getOptimizationFlags(const Value *V) {
if (PEO->isExact())
Flags |= 1 << bitc::PEO_EXACT;
} else if (const auto *FPMO = dyn_cast<FPMathOperator>(V)) {
- if (FPMO->hasUnsafeAlgebra())
- Flags |= FastMathFlags::UnsafeAlgebra;
+ if (FPMO->hasAllowReassoc())
+ Flags |= FastMathFlags::AllowReassoc;
if (FPMO->hasNoNaNs())
Flags |= FastMathFlags::NoNaNs;
if (FPMO->hasNoInfs())
@@ -1331,6 +1333,8 @@ static uint64_t getOptimizationFlags(const Value *V) {
Flags |= FastMathFlags::AllowReciprocal;
if (FPMO->hasAllowContract())
Flags |= FastMathFlags::AllowContract;
+ if (FPMO->hasApproxFunc())
+ Flags |= FastMathFlags::ApproxFunc;
}
return Flags;
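The writer packs NotEligibleToImport, Live, and the new DSOLocal flag into bits 0-2 of the raw summary flags, matching the reader's RawFlags & 0x4 test. A standalone round-trip sketch of just that bit layout (linkage encoding left out):

#include <cstdint>

struct SummaryFlags { bool NotEligibleToImport, Live, DSOLocal; };

uint64_t encode(SummaryFlags F) {
  uint64_t Raw = 0;
  Raw |= F.NotEligibleToImport;        // bit 0
  Raw |= uint64_t(F.Live) << 1;        // bit 1
  Raw |= uint64_t(F.DSOLocal) << 2;    // bit 2, new in this change
  return Raw;
}

SummaryFlags decode(uint64_t Raw) {
  return {bool(Raw & 0x1), bool(Raw & 0x2), bool(Raw & 0x4)};
}

// decode(encode(F)) reproduces F for every flag combination.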
diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h
index 730187087dc..011356c3260 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/lib/Bitcode/Writer/ValueEnumerator.h
@@ -18,6 +18,8 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/UniqueVector.h"
#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
#include "llvm/IR/UseListOrder.h"
#include <cassert>
#include <cstdint>
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index d7f91fc1ce3..1dea746a6ac 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -28,12 +28,12 @@
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cassert>
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 876cca4bc7a..9642368a047 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -15,6 +15,7 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
@@ -24,7 +25,6 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index 8b1376ab363..973816d5635 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -29,7 +29,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index a35fcdaaf9a..4ebc7176943 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -51,6 +51,8 @@
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Comdat.h"
#include "llvm/IR/Constant.h"
@@ -100,8 +102,6 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index eae79ad101d..5250f1b1787 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -17,6 +17,7 @@
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InlineAsm.h"
@@ -32,7 +33,6 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 67bab8c7684..5aa3f4ae103 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -68,7 +68,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Support/SMLoc.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index dd7f7931b06..1a6cb967992 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -31,7 +31,7 @@
#include "llvm/MC/MachineLocation.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 06b5b06c41b..603d0f7f470 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -36,7 +36,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index 6e4625ba411..167ca13c19c 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -15,6 +15,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/DIE.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/Support/Allocator.h"
#include <memory>
#include <utility>
@@ -27,7 +28,6 @@ class DwarfCompileUnit;
class DwarfUnit;
class LexicalScope;
class MCSection;
-class MDNode;
class DwarfFile {
// Target of Dwarf emission, used for sizing of abbreviations.
diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp
index 5d485f21357..35ce1fec385 100644
--- a/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -33,7 +33,7 @@
#include "llvm/MC/MCWin64EH.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index 40cb0c0cdf1..cd6056b674c 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -39,6 +39,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
@@ -52,7 +53,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -296,6 +296,11 @@ static unsigned HashEndOfMBB(const MachineBasicBlock &MBB) {
return HashMachineInstr(*I);
}
+/// Whether MI should be counted as an instruction when calculating common tail.
+static bool countsAsInstruction(const MachineInstr &MI) {
+ return !(MI.isDebugValue() || MI.isCFIInstruction());
+}
+
/// ComputeCommonTailLength - Given two machine basic blocks, compute the number
/// of instructions they actually have in common together at their end. Return
/// iterators for the first shared instruction in each block.
@@ -310,26 +315,27 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
while (I1 != MBB1->begin() && I2 != MBB2->begin()) {
--I1; --I2;
// Skip debugging pseudos; necessary to avoid changing the code.
- while (I1->isDebugValue()) {
+ while (!countsAsInstruction(*I1)) {
if (I1==MBB1->begin()) {
- while (I2->isDebugValue()) {
- if (I2==MBB2->begin())
+ while (!countsAsInstruction(*I2)) {
+ if (I2==MBB2->begin()) {
// I1==DBG at begin; I2==DBG at begin
- return TailLen;
+ goto SkipTopCFIAndReturn;
+ }
--I2;
}
++I2;
// I1==DBG at begin; I2==non-DBG, or first of DBGs not at begin
- return TailLen;
+ goto SkipTopCFIAndReturn;
}
--I1;
}
// I1==first (untested) non-DBG preceding known match
- while (I2->isDebugValue()) {
+ while (!countsAsInstruction(*I2)) {
if (I2==MBB2->begin()) {
++I1;
// I1==non-DBG, or first of DBGs not at begin; I2==DBG at begin
- return TailLen;
+ goto SkipTopCFIAndReturn;
}
--I2;
}
@@ -368,6 +374,37 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
}
++I1;
}
+
+SkipTopCFIAndReturn:
+ // Ensure that I1 and I2 do not point to a CFI_INSTRUCTION. This can happen if
+  // I1 and I2 are non-identical when compared and then one or both of them end
+ // up pointing to a CFI instruction after being incremented. For example:
+ /*
+ BB1:
+ ...
+ INSTRUCTION_A
+ ADD32ri8 <- last common instruction
+ ...
+ BB2:
+ ...
+ INSTRUCTION_B
+ CFI_INSTRUCTION
+ ADD32ri8 <- last common instruction
+ ...
+ */
+ // When INSTRUCTION_A and INSTRUCTION_B are compared as not equal, after
+ // incrementing the iterators, I1 will point to ADD, however I2 will point to
+ // the CFI instruction. Later on, this leads to BB2 being 'hacked off' at the
+ // wrong place (in ReplaceTailWithBranchTo()) which results in losing this CFI
+ // instruction.
+ while (I1 != MBB1->end() && I1->isCFIInstruction()) {
+ ++I1;
+ }
+
+ while (I2 != MBB2->end() && I2->isCFIInstruction()) {
+ ++I2;
+ }
+
return TailLen;
}
@@ -454,7 +491,7 @@ static unsigned EstimateRuntime(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator E) {
unsigned Time = 0;
for (; I != E; ++I) {
- if (I->isDebugValue())
+ if (!countsAsInstruction(*I))
continue;
if (I->isCall())
Time += 10;
@@ -814,12 +851,12 @@ mergeOperations(MachineBasicBlock::iterator MBBIStartPos,
assert(MBBI != MBBIE && "Reached BB end within common tail length!");
(void)MBBIE;
- if (MBBI->isDebugValue()) {
+ if (!countsAsInstruction(*MBBI)) {
++MBBI;
continue;
}
- while ((MBBICommon != MBBIECommon) && MBBICommon->isDebugValue())
+ while ((MBBICommon != MBBIECommon) && !countsAsInstruction(*MBBICommon))
++MBBICommon;
assert(MBBICommon != MBBIECommon &&
@@ -859,7 +896,7 @@ void BranchFolder::mergeCommonTails(unsigned commonTailIndex) {
}
for (auto &MI : *MBB) {
- if (MI.isDebugValue())
+ if (!countsAsInstruction(MI))
continue;
DebugLoc DL = MI.getDebugLoc();
for (unsigned int i = 0 ; i < NextCommonInsts.size() ; i++) {
@@ -869,7 +906,7 @@ void BranchFolder::mergeCommonTails(unsigned commonTailIndex) {
auto &Pos = NextCommonInsts[i];
assert(Pos != SameTails[i].getBlock()->end() &&
"Reached BB end within common tail");
- while (Pos->isDebugValue()) {
+ while (!countsAsInstruction(*Pos)) {
++Pos;
assert(Pos != SameTails[i].getBlock()->end() &&
"Reached BB end within common tail");
diff --git a/lib/CodeGen/BranchRelaxation.cpp b/lib/CodeGen/BranchRelaxation.cpp
index 2d21fbeea39..73b399e4444 100644
--- a/lib/CodeGen/BranchRelaxation.cpp
+++ b/lib/CodeGen/BranchRelaxation.cpp
@@ -15,6 +15,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Compiler.h"
@@ -22,7 +23,6 @@
#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cassert>
diff --git a/lib/CodeGen/CFIInstrInserter.cpp b/lib/CodeGen/CFIInstrInserter.cpp
new file mode 100644
index 00000000000..5464ee443e0
--- /dev/null
+++ b/lib/CodeGen/CFIInstrInserter.cpp
@@ -0,0 +1,319 @@
+//===------ CFIInstrInserter.cpp - Insert additional CFI instructions -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass verifies incoming and outgoing CFA information of basic
+/// blocks. CFA information is information about offset and register set by CFI
+/// directives, valid at the start and end of a basic block. This pass checks
+/// that outgoing information of predecessors matches incoming information of
+/// their successors. Then it checks if blocks have correct CFA calculation rule
+/// set and inserts additional CFI instruction at their beginnings if they
+/// don't. CFI instructions are inserted if basic blocks have incorrect offset
+/// or register set by previous blocks, as a result of a non-linear layout of
+/// blocks in a function.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+namespace {
+class CFIInstrInserter : public MachineFunctionPass {
+ public:
+ static char ID;
+
+ CFIInstrInserter() : MachineFunctionPass(ID) {
+ initializeCFIInstrInserterPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+
+ if (!MF.getMMI().hasDebugInfo() &&
+ !MF.getFunction()->needsUnwindTableEntry())
+ return false;
+
+ MBBVector.resize(MF.getNumBlockIDs());
+ calculateCFAInfo(MF);
+#ifndef NDEBUG
+ unsigned ErrorNum = verify(MF);
+ if (ErrorNum)
+ report_fatal_error("Found " + Twine(ErrorNum) +
+ " in/out CFI information errors.");
+#endif
+ bool insertedCFI = insertCFIInstrs(MF);
+ MBBVector.clear();
+ return insertedCFI;
+ }
+
+ private:
+ struct MBBCFAInfo {
+ MachineBasicBlock *MBB;
+ /// Value of cfa offset valid at basic block entry.
+ int IncomingCFAOffset = -1;
+ /// Value of cfa offset valid at basic block exit.
+ int OutgoingCFAOffset = -1;
+ /// Value of cfa register valid at basic block entry.
+ unsigned IncomingCFARegister = 0;
+ /// Value of cfa register valid at basic block exit.
+ unsigned OutgoingCFARegister = 0;
+    /// True if in/out cfa offset and register values for this block have
+    /// already been set.
+ bool Processed = false;
+ };
+
+ /// Contains cfa offset and register values valid at entry and exit of basic
+ /// blocks.
+ SmallVector<struct MBBCFAInfo, 4> MBBVector;
+
+ /// Calculate cfa offset and register values valid at entry and exit for all
+ /// basic blocks in a function.
+ void calculateCFAInfo(MachineFunction &MF);
+ /// Calculate cfa offset and register values valid at basic block exit by
+ /// checking the block for CFI instructions. Block's incoming CFA info remains
+ /// the same.
+ void calculateOutgoingCFAInfo(struct MBBCFAInfo &MBBInfo);
+ /// Update in/out cfa offset and register values for successors of the basic
+ /// block.
+ void updateSuccCFAInfo(struct MBBCFAInfo &MBBInfo);
+
+  /// Check if incoming CFA information of a basic block matches outgoing CFA
+  /// information of the previous block. If it doesn't, insert a CFI
+  /// instruction at the beginning of the block that corrects the CFA
+  /// calculation rule for that block.
+ bool insertCFIInstrs(MachineFunction &MF);
+  /// Return the cfa offset value that should be set at the beginning of an
+  /// MBB if needed. The negated value is needed when creating CFI
+  /// instructions that set an absolute offset.
+ int getCorrectCFAOffset(MachineBasicBlock *MBB) {
+ return -MBBVector[MBB->getNumber()].IncomingCFAOffset;
+ }
+
+ void report(const char *msg, MachineBasicBlock &MBB);
+ /// Go through each MBB in a function and check that outgoing offset and
+ /// register of its predecessors match incoming offset and register of that
+ /// MBB, as well as that incoming offset and register of its successors match
+ /// outgoing offset and register of the MBB.
+ unsigned verify(MachineFunction &MF);
+};
+}
+
+char CFIInstrInserter::ID = 0;
+INITIALIZE_PASS(CFIInstrInserter, "cfi-instr-inserter",
+ "Check CFA info and insert CFI instructions if needed", false,
+ false)
+FunctionPass *llvm::createCFIInstrInserter() { return new CFIInstrInserter(); }
+
+void CFIInstrInserter::calculateCFAInfo(MachineFunction &MF) {
+ // Initial CFA offset value i.e. the one valid at the beginning of the
+ // function.
+ int InitialOffset =
+ MF.getSubtarget().getFrameLowering()->getInitialCFAOffset(MF);
+ // Initial CFA register value i.e. the one valid at the beginning of the
+ // function.
+ unsigned InitialRegister =
+ MF.getSubtarget().getFrameLowering()->getInitialCFARegister(MF);
+
+  // Initialize MBBVector.
+ for (MachineBasicBlock &MBB : MF) {
+ struct MBBCFAInfo MBBInfo;
+ MBBInfo.MBB = &MBB;
+ MBBInfo.IncomingCFAOffset = InitialOffset;
+ MBBInfo.OutgoingCFAOffset = InitialOffset;
+ MBBInfo.IncomingCFARegister = InitialRegister;
+ MBBInfo.OutgoingCFARegister = InitialRegister;
+ MBBVector[MBB.getNumber()] = MBBInfo;
+ }
+
+ // Set in/out cfa info for all blocks in the function. This traversal is based
+ // on the assumption that the first block in the function is the entry block
+ // i.e. that it has initial cfa offset and register values as incoming CFA
+ // information.
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBBVector[MBB.getNumber()].Processed) continue;
+ calculateOutgoingCFAInfo(MBBVector[MBB.getNumber()]);
+ updateSuccCFAInfo(MBBVector[MBB.getNumber()]);
+ }
+}
+
+void CFIInstrInserter::calculateOutgoingCFAInfo(struct MBBCFAInfo &MBBInfo) {
+ // Outgoing cfa offset set by the block.
+ int SetOffset = MBBInfo.IncomingCFAOffset;
+ // Outgoing cfa register set by the block.
+ unsigned SetRegister = MBBInfo.IncomingCFARegister;
+ const std::vector<MCCFIInstruction> &Instrs =
+ MBBInfo.MBB->getParent()->getFrameInstructions();
+
+ // Determine cfa offset and register set by the block.
+ for (MachineInstr &MI :
+ make_range(MBBInfo.MBB->instr_begin(), MBBInfo.MBB->instr_end())) {
+ if (MI.isCFIInstruction()) {
+ unsigned CFIIndex = MI.getOperand(0).getCFIIndex();
+ const MCCFIInstruction &CFI = Instrs[CFIIndex];
+ if (CFI.getOperation() == MCCFIInstruction::OpDefCfaRegister) {
+ SetRegister = CFI.getRegister();
+ } else if (CFI.getOperation() == MCCFIInstruction::OpDefCfaOffset) {
+ SetOffset = CFI.getOffset();
+ } else if (CFI.getOperation() == MCCFIInstruction::OpAdjustCfaOffset) {
+ SetOffset += CFI.getOffset();
+ } else if (CFI.getOperation() == MCCFIInstruction::OpDefCfa) {
+ SetRegister = CFI.getRegister();
+ SetOffset = CFI.getOffset();
+ }
+ }
+ }
+
+ MBBInfo.Processed = true;
+
+ // Update outgoing CFA info.
+ MBBInfo.OutgoingCFAOffset = SetOffset;
+ MBBInfo.OutgoingCFARegister = SetRegister;
+}
+
+void CFIInstrInserter::updateSuccCFAInfo(struct MBBCFAInfo &MBBInfo) {
+ for (MachineBasicBlock *Succ : MBBInfo.MBB->successors()) {
+ struct MBBCFAInfo &SuccInfo = MBBVector[Succ->getNumber()];
+ if (SuccInfo.Processed) continue;
+ SuccInfo.IncomingCFAOffset = MBBInfo.OutgoingCFAOffset;
+ SuccInfo.IncomingCFARegister = MBBInfo.OutgoingCFARegister;
+ calculateOutgoingCFAInfo(SuccInfo);
+ updateSuccCFAInfo(SuccInfo);
+ }
+}
+
+bool CFIInstrInserter::insertCFIInstrs(MachineFunction &MF) {
+ const struct MBBCFAInfo *PrevMBBInfo = &MBBVector[MF.front().getNumber()];
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ bool InsertedCFIInstr = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ // Skip the first MBB in a function
+ if (MBB.getNumber() == MF.front().getNumber()) continue;
+
+ const struct MBBCFAInfo& MBBInfo = MBBVector[MBB.getNumber()];
+ auto MBBI = MBBInfo.MBB->begin();
+ DebugLoc DL = MBBInfo.MBB->findDebugLoc(MBBI);
+
+ if (PrevMBBInfo->OutgoingCFAOffset != MBBInfo.IncomingCFAOffset) {
+ // If both outgoing offset and register of a previous block don't match
+ // incoming offset and register of this block, add a def_cfa instruction
+ // with the correct offset and register for this block.
+ if (PrevMBBInfo->OutgoingCFARegister != MBBInfo.IncomingCFARegister) {
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+ nullptr, MBBInfo.IncomingCFARegister, getCorrectCFAOffset(&MBB)));
+ BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ // If outgoing offset of a previous block doesn't match incoming offset
+ // of this block, add a def_cfa_offset instruction with the correct
+ // offset for this block.
+ } else {
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(
+ nullptr, getCorrectCFAOffset(&MBB)));
+ BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ InsertedCFIInstr = true;
+ // If outgoing register of a previous block doesn't match incoming
+ // register of this block, add a def_cfa_register instruction with the
+ // correct register for this block.
+ } else if (PrevMBBInfo->OutgoingCFARegister != MBBInfo.IncomingCFARegister) {
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+ nullptr, MBBInfo.IncomingCFARegister));
+ BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ InsertedCFIInstr = true;
+ }
+ PrevMBBInfo = &MBBInfo;
+ }
+ return InsertedCFIInstr;
+}
+
+void CFIInstrInserter::report(const char *msg, MachineBasicBlock &MBB) {
+ errs() << '\n';
+ errs() << "*** " << msg << " ***\n"
+ << "- function: " << MBB.getParent()->getName() << "\n";
+ errs() << "- basic block: BB#" << MBB.getNumber() << ' ' << MBB.getName()
+ << " (" << (const void *)&MBB << ')';
+ errs() << '\n';
+}
+
+unsigned CFIInstrInserter::verify(MachineFunction &MF) {
+ unsigned ErrorNum = 0;
+ for (MachineBasicBlock &CurrMBB : MF) {
+ const struct MBBCFAInfo& CurrMBBInfo = MBBVector[CurrMBB.getNumber()];
+ for (MachineBasicBlock *Pred : CurrMBB.predecessors()) {
+ const struct MBBCFAInfo& PredMBBInfo = MBBVector[Pred->getNumber()];
+ // Check that outgoing offset values of predecessors match the incoming
+ // offset value of CurrMBB
+ if (PredMBBInfo.OutgoingCFAOffset != CurrMBBInfo.IncomingCFAOffset) {
+ report("The outgoing offset of a predecessor is inconsistent.",
+ CurrMBB);
+ errs() << "Predecessor BB#" << Pred->getNumber()
+ << " has outgoing offset (" << PredMBBInfo.OutgoingCFAOffset
+ << "), while BB#" << CurrMBB.getNumber()
+ << " has incoming offset (" << CurrMBBInfo.IncomingCFAOffset
+ << ").\n";
+ ErrorNum++;
+ }
+ // Check that outgoing register values of predecessors match the incoming
+ // register value of CurrMBB
+ if (PredMBBInfo.OutgoingCFARegister != CurrMBBInfo.IncomingCFARegister) {
+ report("The outgoing register of a predecessor is inconsistent.",
+ CurrMBB);
+ errs() << "Predecessor BB#" << Pred->getNumber()
+ << " has outgoing register (" << PredMBBInfo.OutgoingCFARegister
+ << "), while BB#" << CurrMBB.getNumber()
+ << " has incoming register (" << CurrMBBInfo.IncomingCFARegister
+ << ").\n";
+ ErrorNum++;
+ }
+ }
+
+ for (MachineBasicBlock *Succ : CurrMBB.successors()) {
+ const struct MBBCFAInfo& SuccMBBInfo = MBBVector[Succ->getNumber()];
+ // Check that incoming offset values of successors match the outgoing
+ // offset value of CurrMBB
+ if (SuccMBBInfo.IncomingCFAOffset != CurrMBBInfo.OutgoingCFAOffset) {
+ report("The incoming offset of a successor is inconsistent.", CurrMBB);
+ errs() << "Successor BB#" << Succ->getNumber()
+ << " has incoming offset (" << SuccMBBInfo.IncomingCFAOffset
+ << "), while BB#" << CurrMBB.getNumber()
+ << " has outgoing offset (" << CurrMBBInfo.OutgoingCFAOffset
+ << ").\n";
+ ErrorNum++;
+ }
+ // Check that incoming register values of successors match the outgoing
+ // register value of CurrMBB
+ if (SuccMBBInfo.IncomingCFARegister != CurrMBBInfo.OutgoingCFARegister) {
+ report("The incoming register of a successor is inconsistent.",
+ CurrMBB);
+ errs() << "Successor BB#" << Succ->getNumber()
+ << " has incoming register (" << SuccMBBInfo.IncomingCFARegister
+ << "), while BB#" << CurrMBB.getNumber()
+ << " has outgoing register (" << CurrMBBInfo.OutgoingCFARegister
+ << ").\n";
+ ErrorNum++;
+ }
+ }
+ }
+ return ErrorNum;
+}
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 7ec7fda4e44..1ae5aa80bf9 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -9,6 +9,7 @@ add_llvm_library(LLVMCodeGen
BuiltinGCs.cpp
CalcSpillWeights.cpp
CallingConvLower.cpp
+ CFIInstrInserter.cpp
CodeGen.cpp
CodeGenPrepare.cpp
CountingFunctionInserter.cpp
@@ -21,6 +22,7 @@ add_llvm_library(LLVMCodeGen
EdgeBundles.cpp
ExecutionDepsFix.cpp
ExpandISelPseudos.cpp
+ ExpandMemCmp.cpp
ExpandPostRAPseudos.cpp
ExpandReductions.cpp
FaultMaps.cpp
@@ -113,6 +115,7 @@ add_llvm_library(LLVMCodeGen
RegisterPressure.cpp
RegisterScavenging.cpp
RenameIndependentSubregs.cpp
+ MIRCanonicalizerPass.cpp
RegisterUsageInfo.cpp
RegUsageInfoCollector.cpp
RegUsageInfoPropagate.cpp
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
index 588f1791ce3..d4ac5fd040c 100644
--- a/lib/CodeGen/CalcSpillWeights.cpp
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -16,10 +16,10 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cassert>
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index f4ccb4889d3..9d10d1b75f5 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -23,6 +23,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeAtomicExpandPass(Registry);
initializeBranchFolderPassPass(Registry);
initializeBranchRelaxationPass(Registry);
+ initializeCFIInstrInserterPass(Registry);
initializeCodeGenPreparePass(Registry);
initializeCountingFunctionInserterPass(Registry);
initializeDeadMachineInstructionElimPass(Registry);
@@ -30,6 +31,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeDwarfEHPreparePass(Registry);
initializeEarlyIfConverterPass(Registry);
initializeExpandISelPseudosPass(Registry);
+ initializeExpandMemCmpPassPass(Registry);
initializeExpandPostRAPass(Registry);
initializeFEntryInserterPass(Registry);
initializeFinalizeMachineBundlesPass(Registry);
@@ -99,6 +101,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeVirtRegRewriterPass(Registry);
initializeWinEHPreparePass(Registry);
initializeXRayInstrumentationPass(Registry);
+ initializeMIRCanonicalizerPass(Registry);
}
void LLVMInitializeCodeGen(LLVMPassRegistryRef R) {
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 51f2a320b29..d6633a508f5 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -113,6 +113,12 @@ STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
"of sunken Casts");
STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
"computations were sunk");
+STATISTIC(NumMemoryInstsPhiCreated,
+ "Number of phis created when address "
+ "computations were sunk to memory instructions");
+STATISTIC(NumMemoryInstsSelectCreated,
+          "Number of selects created when address "
+ "computations were sunk to memory instructions");
STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
STATISTIC(NumAndsAdded,
@@ -123,12 +129,6 @@ STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
-STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
-STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
-STATISTIC(NumMemCmpGreaterThanMax,
- "Number of memcmp calls with size greater than max size");
-STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
-
static cl::opt<bool> DisableBranchOpts(
"disable-cgp-branch-opts", cl::Hidden, cl::init(false),
cl::desc("Disable branch optimizations in CodeGenPrepare"));
@@ -189,10 +189,18 @@ EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
cl::desc("Enable merging of redundant sexts when one is dominating"
" the other."), cl::init(true));
-static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
- "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
- cl::desc("The number of loads per basic block for inline expansion of "
- "memcmp that is only being compared against zero."));
+static cl::opt<bool> DisableComplexAddrModes(
+ "disable-complex-addr-modes", cl::Hidden, cl::init(true),
+ cl::desc("Disables combining addressing modes with different parts "
+ "in optimizeMemoryInst."));
+
+static cl::opt<bool>
+AddrSinkNewPhis("addr-sink-new-phis", cl::Hidden, cl::init(false),
+ cl::desc("Allow creation of Phis in Address sinking."));
+
+static cl::opt<bool>
+AddrSinkNewSelects("addr-sink-new-select", cl::Hidden, cl::init(false),
+ cl::desc("Allow creation of selects in Address sinking."));
namespace {
@@ -1182,6 +1190,7 @@ static bool SinkCast(CastInst *CI) {
// If we removed all uses, nuke the cast.
if (CI->use_empty()) {
+ salvageDebugInfo(*CI);
CI->eraseFromParent();
MadeChange = true;
}
@@ -1697,699 +1706,6 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
return true;
}
-namespace {
-
-// This class provides helper functions to expand a memcmp library call into an
-// inline expansion.
-class MemCmpExpansion {
- struct ResultBlock {
- BasicBlock *BB = nullptr;
- PHINode *PhiSrc1 = nullptr;
- PHINode *PhiSrc2 = nullptr;
-
- ResultBlock() = default;
- };
-
- CallInst *const CI;
- ResultBlock ResBlock;
- const uint64_t Size;
- unsigned MaxLoadSize;
- uint64_t NumLoadsNonOneByte;
- const uint64_t NumLoadsPerBlock;
- std::vector<BasicBlock *> LoadCmpBlocks;
- BasicBlock *EndBlock;
- PHINode *PhiRes;
- const bool IsUsedForZeroCmp;
- const DataLayout &DL;
- IRBuilder<> Builder;
- // Represents the decomposition in blocks of the expansion. For example,
- // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
- // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {32, 1}.
- // TODO(courbet): Involve the target more in this computation. On X86, 7
- // bytes can be done more efficiently with two overlaping 4-byte loads than
- // covering the interval with [{4, 0},{2, 4},{1, 6}}.
- struct LoadEntry {
- LoadEntry(unsigned LoadSize, uint64_t Offset)
- : LoadSize(LoadSize), Offset(Offset) {
- assert(Offset % LoadSize == 0 && "invalid load entry");
- }
-
- uint64_t getGEPIndex() const { return Offset / LoadSize; }
-
- // The size of the load for this block, in bytes.
- const unsigned LoadSize;
- // The offset of this load WRT the base pointer, in bytes.
- const uint64_t Offset;
- };
- SmallVector<LoadEntry, 8> LoadSequence;
-
- void createLoadCmpBlocks();
- void createResultBlock();
- void setupResultBlockPHINodes();
- void setupEndBlockPHINodes();
- Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex);
- void emitLoadCompareBlock(unsigned BlockIndex);
- void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
- unsigned &LoadIndex);
- void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex);
- void emitMemCmpResultBlock();
- Value *getMemCmpExpansionZeroCase();
- Value *getMemCmpEqZeroOneBlock();
- Value *getMemCmpOneBlock();
-
- public:
- MemCmpExpansion(CallInst *CI, uint64_t Size,
- const TargetTransformInfo::MemCmpExpansionOptions &Options,
- unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
- unsigned NumLoadsPerBlock, const DataLayout &DL);
-
- unsigned getNumBlocks();
- uint64_t getNumLoads() const { return LoadSequence.size(); }
-
- Value *getMemCmpExpansion();
-};
-
-} // end anonymous namespace
-
-// Initialize the basic block structure required for expansion of memcmp call
-// with given maximum load size and memcmp size parameter.
-// This structure includes:
-// 1. A list of load compare blocks - LoadCmpBlocks.
-// 2. An EndBlock, split from original instruction point, which is the block to
-// return from.
-// 3. ResultBlock, block to branch to for early exit when a
-// LoadCmpBlock finds a difference.
-MemCmpExpansion::MemCmpExpansion(
- CallInst *const CI, uint64_t Size,
- const TargetTransformInfo::MemCmpExpansionOptions &Options,
- const unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
- const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout)
- : CI(CI),
- Size(Size),
- MaxLoadSize(0),
- NumLoadsNonOneByte(0),
- NumLoadsPerBlock(NumLoadsPerBlock),
- IsUsedForZeroCmp(IsUsedForZeroCmp),
- DL(TheDataLayout),
- Builder(CI) {
- assert(Size > 0 && "zero blocks");
- // Scale the max size down if the target can load more bytes than we need.
- size_t LoadSizeIndex = 0;
- while (LoadSizeIndex < Options.LoadSizes.size() &&
- Options.LoadSizes[LoadSizeIndex] > Size) {
- ++LoadSizeIndex;
- }
- this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex];
- // Compute the decomposition.
- uint64_t CurSize = Size;
- uint64_t Offset = 0;
- while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) {
- const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex];
- assert(LoadSize > 0 && "zero load size");
- const uint64_t NumLoadsForThisSize = CurSize / LoadSize;
- if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
- // Do not expand if the total number of loads is larger than what the
- // target allows. Note that it's important that we exit before completing
- // the expansion to avoid using a ton of memory to store the expansion for
- // large sizes.
- LoadSequence.clear();
- return;
- }
- if (NumLoadsForThisSize > 0) {
- for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
- LoadSequence.push_back({LoadSize, Offset});
- Offset += LoadSize;
- }
- if (LoadSize > 1) {
- ++NumLoadsNonOneByte;
- }
- CurSize = CurSize % LoadSize;
- }
- ++LoadSizeIndex;
- }
- assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
-}
-
-unsigned MemCmpExpansion::getNumBlocks() {
- if (IsUsedForZeroCmp)
- return getNumLoads() / NumLoadsPerBlock +
- (getNumLoads() % NumLoadsPerBlock != 0 ? 1 : 0);
- return getNumLoads();
-}
-
-void MemCmpExpansion::createLoadCmpBlocks() {
- for (unsigned i = 0; i < getNumBlocks(); i++) {
- BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
- EndBlock->getParent(), EndBlock);
- LoadCmpBlocks.push_back(BB);
- }
-}
-
-void MemCmpExpansion::createResultBlock() {
- ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
- EndBlock->getParent(), EndBlock);
-}
-
-// This function creates the IR instructions for loading and comparing 1 byte.
-// It loads 1 byte from each source of the memcmp parameters with the given
-// GEPIndex. It then subtracts the two loaded values and adds this result to the
-// final phi node for selecting the memcmp result.
-void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
- unsigned GEPIndex) {
- Value *Source1 = CI->getArgOperand(0);
- Value *Source2 = CI->getArgOperand(1);
-
- Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
- Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
- // Cast source to LoadSizeType*.
- if (Source1->getType() != LoadSizeType)
- Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
- if (Source2->getType() != LoadSizeType)
- Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
-
- // Get the base address using the GEPIndex.
- if (GEPIndex != 0) {
- Source1 = Builder.CreateGEP(LoadSizeType, Source1,
- ConstantInt::get(LoadSizeType, GEPIndex));
- Source2 = Builder.CreateGEP(LoadSizeType, Source2,
- ConstantInt::get(LoadSizeType, GEPIndex));
- }
-
- Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
- Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
-
- LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
- LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
- Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
-
- PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]);
-
- if (BlockIndex < (LoadCmpBlocks.size() - 1)) {
- // Early exit branch if difference found to EndBlock. Otherwise, continue to
- // next LoadCmpBlock,
- Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
- ConstantInt::get(Diff->getType(), 0));
- BranchInst *CmpBr =
- BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp);
- Builder.Insert(CmpBr);
- } else {
- // The last block has an unconditional branch to EndBlock.
- BranchInst *CmpBr = BranchInst::Create(EndBlock);
- Builder.Insert(CmpBr);
- }
-}
-
-/// Generate an equality comparison for one or more pairs of loaded values.
-/// This is used in the case where the memcmp() call is compared equal or not
-/// equal to zero.
-Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
- unsigned &LoadIndex) {
- assert(LoadIndex < getNumLoads() &&
- "getCompareLoadPairs() called with no remaining loads");
- std::vector<Value *> XorList, OrList;
- Value *Diff;
-
- const unsigned NumLoads =
- std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock);
-
- // For a single-block expansion, start inserting before the memcmp call.
- if (LoadCmpBlocks.empty())
- Builder.SetInsertPoint(CI);
- else
- Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
-
- Value *Cmp = nullptr;
- // If we have multiple loads per block, we need to generate a composite
- // comparison using xor+or. The type for the combinations is the largest load
- // type.
- IntegerType *const MaxLoadType =
- NumLoads == 1 ? nullptr
- : IntegerType::get(CI->getContext(), MaxLoadSize * 8);
- for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
- const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
-
- IntegerType *LoadSizeType =
- IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
-
- Value *Source1 = CI->getArgOperand(0);
- Value *Source2 = CI->getArgOperand(1);
-
- // Cast source to LoadSizeType*.
- if (Source1->getType() != LoadSizeType)
- Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
- if (Source2->getType() != LoadSizeType)
- Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
-
- // Get the base address using a GEP.
- if (CurLoadEntry.Offset != 0) {
- Source1 = Builder.CreateGEP(
- LoadSizeType, Source1,
- ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
- Source2 = Builder.CreateGEP(
- LoadSizeType, Source2,
- ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
- }
-
- // Get a constant or load a value for each source address.
- Value *LoadSrc1 = nullptr;
- if (auto *Source1C = dyn_cast<Constant>(Source1))
- LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
- if (!LoadSrc1)
- LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
-
- Value *LoadSrc2 = nullptr;
- if (auto *Source2C = dyn_cast<Constant>(Source2))
- LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
- if (!LoadSrc2)
- LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
-
- if (NumLoads != 1) {
- if (LoadSizeType != MaxLoadType) {
- LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
- LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
- }
- // If we have multiple loads per block, we need to generate a composite
- // comparison using xor+or.
- Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
- Diff = Builder.CreateZExt(Diff, MaxLoadType);
- XorList.push_back(Diff);
- } else {
- // If there's only one load per block, we just compare the loaded values.
- Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
- }
- }
-
- auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
- std::vector<Value *> OutList;
- for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
- Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
- OutList.push_back(Or);
- }
- if (InList.size() % 2 != 0)
- OutList.push_back(InList.back());
- return OutList;
- };
-
- if (!Cmp) {
- // Pairwise OR the XOR results.
- OrList = pairWiseOr(XorList);
-
- // Pairwise OR the OR results until one result left.
- while (OrList.size() != 1) {
- OrList = pairWiseOr(OrList);
- }
- Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
- }
-
- return Cmp;
-}
-
-void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
- unsigned &LoadIndex) {
- Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);
-
- BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
- ? EndBlock
- : LoadCmpBlocks[BlockIndex + 1];
- // Early exit branch if difference found to ResultBlock. Otherwise,
- // continue to next LoadCmpBlock or EndBlock.
- BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
- Builder.Insert(CmpBr);
-
- // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
- // since early exit to ResultBlock was not taken (no difference was found in
- // any of the bytes).
- if (BlockIndex == LoadCmpBlocks.size() - 1) {
- Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
- PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
- }
-}
-
-// This function creates the IR intructions for loading and comparing using the
-// given LoadSize. It loads the number of bytes specified by LoadSize from each
-// source of the memcmp parameters. It then does a subtract to see if there was
-// a difference in the loaded values. If a difference is found, it branches
-// with an early exit to the ResultBlock for calculating which source was
-// larger. Otherwise, it falls through to the either the next LoadCmpBlock or
-// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with
-// a special case through emitLoadCompareByteBlock. The special handling can
-// simply subtract the loaded values and add it to the result phi node.
-void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
- // There is one load per block in this case, BlockIndex == LoadIndex.
- const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];
-
- if (CurLoadEntry.LoadSize == 1) {
- MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex,
- CurLoadEntry.getGEPIndex());
- return;
- }
-
- Type *LoadSizeType =
- IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
- Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
- assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
-
- Value *Source1 = CI->getArgOperand(0);
- Value *Source2 = CI->getArgOperand(1);
-
- Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
- // Cast source to LoadSizeType*.
- if (Source1->getType() != LoadSizeType)
- Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
- if (Source2->getType() != LoadSizeType)
- Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
-
- // Get the base address using a GEP.
- if (CurLoadEntry.Offset != 0) {
- Source1 = Builder.CreateGEP(
- LoadSizeType, Source1,
- ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
- Source2 = Builder.CreateGEP(
- LoadSizeType, Source2,
- ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
- }
-
- // Load LoadSizeType from the base address.
- Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
- Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
-
- if (DL.isLittleEndian()) {
- Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
- Intrinsic::bswap, LoadSizeType);
- LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
- LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
- }
-
- if (LoadSizeType != MaxLoadType) {
- LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
- LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
- }
-
- // Add the loaded values to the phi nodes for calculating memcmp result only
- // if result is not used in a zero equality.
- if (!IsUsedForZeroCmp) {
- ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]);
- ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]);
- }
-
- Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2);
- BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
- ? EndBlock
- : LoadCmpBlocks[BlockIndex + 1];
- // Early exit branch if difference found to ResultBlock. Otherwise, continue
- // to next LoadCmpBlock or EndBlock.
- BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);
- Builder.Insert(CmpBr);
-
- // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
- // since early exit to ResultBlock was not taken (no difference was found in
- // any of the bytes).
- if (BlockIndex == LoadCmpBlocks.size() - 1) {
- Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
- PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
- }
-}
-
-// This function populates the ResultBlock with a sequence to calculate the
-// memcmp result. It compares the two loaded source values and returns -1 if
-// src1 < src2 and 1 if src1 > src2.
-void MemCmpExpansion::emitMemCmpResultBlock() {
- // Special case: if memcmp result is used in a zero equality, result does not
- // need to be calculated and can simply return 1.
- if (IsUsedForZeroCmp) {
- BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
- Builder.SetInsertPoint(ResBlock.BB, InsertPt);
- Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
- PhiRes->addIncoming(Res, ResBlock.BB);
- BranchInst *NewBr = BranchInst::Create(EndBlock);
- Builder.Insert(NewBr);
- return;
- }
- BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
- Builder.SetInsertPoint(ResBlock.BB, InsertPt);
-
- Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
- ResBlock.PhiSrc2);
-
- Value *Res =
- Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
- ConstantInt::get(Builder.getInt32Ty(), 1));
-
- BranchInst *NewBr = BranchInst::Create(EndBlock);
- Builder.Insert(NewBr);
- PhiRes->addIncoming(Res, ResBlock.BB);
-}
-
-void MemCmpExpansion::setupResultBlockPHINodes() {
- Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
- Builder.SetInsertPoint(ResBlock.BB);
- // Note: this assumes one load per block.
- ResBlock.PhiSrc1 =
- Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1");
- ResBlock.PhiSrc2 =
- Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2");
-}
-
-void MemCmpExpansion::setupEndBlockPHINodes() {
- Builder.SetInsertPoint(&EndBlock->front());
- PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
-}
-
-Value *MemCmpExpansion::getMemCmpExpansionZeroCase() {
- unsigned LoadIndex = 0;
- // This loop populates each of the LoadCmpBlocks with the IR sequence to
- // handle multiple loads per block.
- for (unsigned I = 0; I < getNumBlocks(); ++I) {
- emitLoadCompareBlockMultipleLoads(I, LoadIndex);
- }
-
- emitMemCmpResultBlock();
- return PhiRes;
-}
-
-/// A memcmp expansion that compares equality with 0 and only has one block of
-/// load and compare can bypass the compare, branch, and phi IR that is required
-/// in the general case.
-Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
- unsigned LoadIndex = 0;
- Value *Cmp = getCompareLoadPairs(0, LoadIndex);
- assert(LoadIndex == getNumLoads() && "some entries were not consumed");
- return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
-}
-
-/// A memcmp expansion that only has one block of load and compare can bypass
-/// the compare, branch, and phi IR that is required in the general case.
-Value *MemCmpExpansion::getMemCmpOneBlock() {
- assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block");
-
- Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
- Value *Source1 = CI->getArgOperand(0);
- Value *Source2 = CI->getArgOperand(1);
-
- // Cast source to LoadSizeType*.
- if (Source1->getType() != LoadSizeType)
- Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
- if (Source2->getType() != LoadSizeType)
- Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
-
- // Load LoadSizeType from the base address.
- Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
- Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
-
- if (DL.isLittleEndian() && Size != 1) {
- Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
- Intrinsic::bswap, LoadSizeType);
- LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
- LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
- }
-
- if (Size < 4) {
- // The i8 and i16 cases don't need compares. We zext the loaded values and
- // subtract them to get the suitable negative, zero, or positive i32 result.
- LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty());
- LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty());
- return Builder.CreateSub(LoadSrc1, LoadSrc2);
- }
-
- // The result of memcmp is negative, zero, or positive, so produce that by
- // subtracting 2 extended compare bits: sub (ugt, ult).
- // If a target prefers to use selects to get -1/0/1, they should be able
- // to transform this later. The inverse transform (going from selects to math)
- // may not be possible in the DAG because the selects got converted into
- // branches before we got there.
- Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
- Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
- Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
- Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
- return Builder.CreateSub(ZextUGT, ZextULT);
-}
-
-// This function expands the memcmp call into an inline expansion and returns
-// the memcmp result.
-Value *MemCmpExpansion::getMemCmpExpansion() {
- // A memcmp with zero-comparison with only one block of load and compare does
- // not need to set up any extra blocks. This case could be handled in the DAG,
- // but since we have all of the machinery to flexibly expand any memcpy here,
- // we choose to handle this case too to avoid fragmented lowering.
- if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) {
- BasicBlock *StartBlock = CI->getParent();
- EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
- setupEndBlockPHINodes();
- createResultBlock();
-
- // If return value of memcmp is not used in a zero equality, we need to
- // calculate which source was larger. The calculation requires the
- // two loaded source values of each load compare block.
- // These will be saved in the phi nodes created by setupResultBlockPHINodes.
- if (!IsUsedForZeroCmp) setupResultBlockPHINodes();
-
- // Create the number of required load compare basic blocks.
- createLoadCmpBlocks();
-
- // Update the terminator added by splitBasicBlock to branch to the first
- // LoadCmpBlock.
- StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
- }
-
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- if (IsUsedForZeroCmp)
- return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock()
- : getMemCmpExpansionZeroCase();
-
- // TODO: Handle more than one load pair per block in getMemCmpOneBlock().
- if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock();
-
- for (unsigned I = 0; I < getNumBlocks(); ++I) {
- emitLoadCompareBlock(I);
- }
-
- emitMemCmpResultBlock();
- return PhiRes;
-}
-
-// This function checks to see if an expansion of memcmp can be generated.
-// It checks for constant compare size that is less than the max inline size.
-// If an expansion cannot occur, returns false to leave as a library call.
-// Otherwise, the library call is replaced with a new IR instruction sequence.
-/// We want to transform:
-/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
-/// To:
-/// loadbb:
-/// %0 = bitcast i32* %buffer2 to i8*
-/// %1 = bitcast i32* %buffer1 to i8*
-/// %2 = bitcast i8* %1 to i64*
-/// %3 = bitcast i8* %0 to i64*
-/// %4 = load i64, i64* %2
-/// %5 = load i64, i64* %3
-/// %6 = call i64 @llvm.bswap.i64(i64 %4)
-/// %7 = call i64 @llvm.bswap.i64(i64 %5)
-/// %8 = sub i64 %6, %7
-/// %9 = icmp ne i64 %8, 0
-/// br i1 %9, label %res_block, label %loadbb1
-/// res_block: ; preds = %loadbb2,
-/// %loadbb1, %loadbb
-/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
-/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
-/// %10 = icmp ult i64 %phi.src1, %phi.src2
-/// %11 = select i1 %10, i32 -1, i32 1
-/// br label %endblock
-/// loadbb1: ; preds = %loadbb
-/// %12 = bitcast i32* %buffer2 to i8*
-/// %13 = bitcast i32* %buffer1 to i8*
-/// %14 = bitcast i8* %13 to i32*
-/// %15 = bitcast i8* %12 to i32*
-/// %16 = getelementptr i32, i32* %14, i32 2
-/// %17 = getelementptr i32, i32* %15, i32 2
-/// %18 = load i32, i32* %16
-/// %19 = load i32, i32* %17
-/// %20 = call i32 @llvm.bswap.i32(i32 %18)
-/// %21 = call i32 @llvm.bswap.i32(i32 %19)
-/// %22 = zext i32 %20 to i64
-/// %23 = zext i32 %21 to i64
-/// %24 = sub i64 %22, %23
-/// %25 = icmp ne i64 %24, 0
-/// br i1 %25, label %res_block, label %loadbb2
-/// loadbb2: ; preds = %loadbb1
-/// %26 = bitcast i32* %buffer2 to i8*
-/// %27 = bitcast i32* %buffer1 to i8*
-/// %28 = bitcast i8* %27 to i16*
-/// %29 = bitcast i8* %26 to i16*
-/// %30 = getelementptr i16, i16* %28, i16 6
-/// %31 = getelementptr i16, i16* %29, i16 6
-/// %32 = load i16, i16* %30
-/// %33 = load i16, i16* %31
-/// %34 = call i16 @llvm.bswap.i16(i16 %32)
-/// %35 = call i16 @llvm.bswap.i16(i16 %33)
-/// %36 = zext i16 %34 to i64
-/// %37 = zext i16 %35 to i64
-/// %38 = sub i64 %36, %37
-/// %39 = icmp ne i64 %38, 0
-/// br i1 %39, label %res_block, label %loadbb3
-/// loadbb3: ; preds = %loadbb2
-/// %40 = bitcast i32* %buffer2 to i8*
-/// %41 = bitcast i32* %buffer1 to i8*
-/// %42 = getelementptr i8, i8* %41, i8 14
-/// %43 = getelementptr i8, i8* %40, i8 14
-/// %44 = load i8, i8* %42
-/// %45 = load i8, i8* %43
-/// %46 = zext i8 %44 to i32
-/// %47 = zext i8 %45 to i32
-/// %48 = sub i32 %46, %47
-/// br label %endblock
-/// endblock: ; preds = %res_block,
-/// %loadbb3
-/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
-/// ret i32 %phi.res
-static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
- const TargetLowering *TLI, const DataLayout *DL) {
- NumMemCmpCalls++;
-
- // Early exit from expansion if -Oz.
- if (CI->getFunction()->optForMinSize())
- return false;
-
- // Early exit from expansion if size is not a constant.
- ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
- if (!SizeCast) {
- NumMemCmpNotConstant++;
- return false;
- }
- const uint64_t SizeVal = SizeCast->getZExtValue();
-
- if (SizeVal == 0) {
- return false;
- }
-
- // TTI call to check if target would like to expand memcmp. Also, get the
- // available load sizes.
- const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
- const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
- if (!Options) return false;
-
- const unsigned MaxNumLoads =
- TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());
-
- MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
- IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL);
-
- // Don't expand if this will require more loads than desired by the target.
- if (Expansion.getNumLoads() == 0) {
- NumMemCmpGreaterThanMax++;
- return false;
- }
-
- NumMemCmpInlined++;
-
- Value *Res = Expansion.getMemCmpExpansion();
-
- // Replace call with result of expansion and erase call.
- CI->replaceAllUsesWith(Res);
- CI->eraseFromParent();
-
- return true;
-}
-
bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
BasicBlock *BB = CI->getParent();
@@ -2542,12 +1858,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
return true;
}
- LibFunc Func;
- if (TLInfo->getLibFunc(ImmutableCallSite(CI), Func) &&
- Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TLI, DL)) {
- ModifiedDT = true;
- return true;
- }
return false;
}
@@ -3377,8 +2687,65 @@ private:
Value *PromotedOperand) const;
};
+/// \brief Keep track of simplification of Phi nodes.
+/// Accept the set of all phi nodes and erase a phi node from this set
+/// if it is simplified.
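+/// Internally the tracker keeps a Value-to-Value map of replacements, and
+/// Get() follows chains in that map; as an illustrative sketch (with
+/// hypothetical values A, B, C), after Put(A, B) and Put(B, C), Get(A)
+/// returns C, the final value that A was simplified to.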
+class SimplificationTracker {
+ DenseMap<Value *, Value *> Storage;
+ const SimplifyQuery &SQ;
+ SmallPtrSetImpl<PHINode *> &AllPhiNodes;
+ SmallPtrSetImpl<SelectInst *> &AllSelectNodes;
+
+public:
+ SimplificationTracker(const SimplifyQuery &sq,
+ SmallPtrSetImpl<PHINode *> &APN,
+ SmallPtrSetImpl<SelectInst *> &ASN)
+ : SQ(sq), AllPhiNodes(APN), AllSelectNodes(ASN) {}
+
+ Value *Get(Value *V) {
+ do {
+ auto SV = Storage.find(V);
+ if (SV == Storage.end())
+ return V;
+ V = SV->second;
+ } while (true);
+ }
+
+ Value *Simplify(Value *Val) {
+ SmallVector<Value *, 32> WorkList;
+ SmallPtrSet<Value *, 32> Visited;
+ WorkList.push_back(Val);
+ while (!WorkList.empty()) {
+ auto P = WorkList.pop_back_val();
+ if (!Visited.insert(P).second)
+ continue;
+ if (auto *PI = dyn_cast<Instruction>(P))
+ if (Value *V = SimplifyInstruction(cast<Instruction>(PI), SQ)) {
+ for (auto *U : PI->users())
+ WorkList.push_back(cast<Value>(U));
+ Put(PI, V);
+ PI->replaceAllUsesWith(V);
+ if (auto *PHI = dyn_cast<PHINode>(PI))
+ AllPhiNodes.erase(PHI);
+ if (auto *Select = dyn_cast<SelectInst>(PI))
+ AllSelectNodes.erase(Select);
+ PI->eraseFromParent();
+ }
+ }
+ return Get(Val);
+ }
+
+ void Put(Value *From, Value *To) {
+ Storage.insert({ From, To });
+ }
+};
+
/// \brief A helper class for combining addressing modes.
class AddressingModeCombiner {
+ typedef std::pair<Value *, BasicBlock *> ValueInBB;
+ typedef DenseMap<ValueInBB, Value *> FoldAddrToValueMapping;
+ typedef std::pair<PHINode *, PHINode *> PHIPair;
+
private:
/// The addressing modes we've collected.
SmallVector<ExtAddrMode, 16> AddrModes;
@@ -3389,7 +2756,19 @@ private:
/// Are the AddrModes that we have all just equal to their original values?
bool AllAddrModesTrivial = true;
+ /// Common Type for all different fields in addressing modes.
+ Type *CommonType;
+
+ /// SimplifyQuery for simplifyInstruction utility.
+ const SimplifyQuery &SQ;
+
+ /// Original Address.
+ ValueInBB Original;
+
public:
+ AddressingModeCombiner(const SimplifyQuery &_SQ, ValueInBB OriginalValue)
+ : CommonType(nullptr), SQ(_SQ), Original(OriginalValue) {}
+
/// \brief Get the combined AddrMode
const ExtAddrMode &getAddrMode() const {
return AddrModes[0];
@@ -3457,12 +2836,356 @@ public:
if (AllAddrModesTrivial)
return false;
- // TODO: Combine multiple AddrModes by inserting a select or phi for the
- // field in which the AddrModes differ.
- return false;
+ if (DisableComplexAddrModes)
+ return false;
+
+ // For now we support only different base registers.
+ // TODO: enable others.
+ if (DifferentField != ExtAddrMode::BaseRegField)
+ return false;
+
+    // Build a map from <original value, basic block where we saw it> to the
+    // value of the base register.
+ FoldAddrToValueMapping Map;
+ initializeMap(Map);
+
+ Value *CommonValue = findCommon(Map);
+ if (CommonValue)
+ AddrModes[0].BaseReg = CommonValue;
+ return CommonValue != nullptr;
+ }
+
+private:
+  /// \brief Initialize Map with anchor values. For each address seen in some
+  /// BB we set the value of the differing field seen in this address.
+  /// If the address is not an instruction then the basic block is set to
+  /// null. At the same time we find a common type for the differing fields
+  /// that we will use to create new Phi/Select nodes. Keep it in the
+  /// CommonType field.
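+  /// (Illustrative sketch with hypothetical names: for an addressing mode
+  /// whose OriginalValue is the instruction p1 in block BB1 and whose BaseReg
+  /// is b1, the anchor becomes Map[{p1, BB1}] = b1; an addressing mode whose
+  /// BaseReg is null instead gets Constant::getNullValue(CommonType) once
+  /// CommonType is known.)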
+ void initializeMap(FoldAddrToValueMapping &Map) {
+ // Keep track of keys where the value is null. We will need to replace it
+ // with constant null when we know the common type.
+ SmallVector<ValueInBB, 2> NullValue;
+ for (auto &AM : AddrModes) {
+ BasicBlock *BB = nullptr;
+ if (Instruction *I = dyn_cast<Instruction>(AM.OriginalValue))
+ BB = I->getParent();
+
+ // For now we support only base register as different field.
+ // TODO: Enable others.
+ Value *DV = AM.BaseReg;
+ if (DV) {
+ if (CommonType)
+ assert(CommonType == DV->getType() && "Different types detected!");
+ else
+ CommonType = DV->getType();
+ Map[{ AM.OriginalValue, BB }] = DV;
+ } else {
+ NullValue.push_back({ AM.OriginalValue, BB });
+ }
+ }
+ assert(CommonType && "At least one non-null value must be!");
+ for (auto VIBB : NullValue)
+ Map[VIBB] = Constant::getNullValue(CommonType);
+ }
+
+  /// \brief We have a mapping from a value A and the basic block where A was
+  /// seen to another value B, where B was a field in the addressing mode
+  /// represented by A. We also have an original value C representing an
+  /// address in some basic block. Traversing from C through phis and selects
+  /// we ended up with the A's in the map. This utility function tries to find
+  /// a value V which is a field in addressing mode C such that, traversing
+  /// through phi nodes and selects, we end up with the corresponding values B
+  /// in the map. The utility will create new Phis/Selects if needed.
+ // The simple example looks as follows:
+ // BB1:
+ // p1 = b1 + 40
+ // br cond BB2, BB3
+ // BB2:
+ // p2 = b2 + 40
+ // br BB3
+ // BB3:
+ // p = phi [p1, BB1], [p2, BB2]
+ // v = load p
+ // Map is
+ // <p1, BB1> -> b1
+ // <p2, BB2> -> b2
+ // Request is
+ // <p, BB3> -> ?
+ // The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3
+ Value *findCommon(FoldAddrToValueMapping &Map) {
+    // Tracks newly created Phi nodes.
+    SmallPtrSet<PHINode *, 32> NewPhiNodes;
+    // Tracks newly created Select nodes.
+    SmallPtrSet<SelectInst *, 32> NewSelectNodes;
+    // Tracks the simplification of newly created phi nodes. The reason we
+    // use this mapping is that we will add newly created Phi nodes to
+    // AddrToBase. Simplification of Phi nodes is recursive, so some Phi node
+    // may be simplified after we added it to AddrToBase.
+    // Using this mapping we can find the current value in AddrToBase.
+ SimplificationTracker ST(SQ, NewPhiNodes, NewSelectNodes);
+
+ // First step, DFS to create PHI nodes for all intermediate blocks.
+ // Also fill traverse order for the second step.
+ SmallVector<ValueInBB, 32> TraverseOrder;
+ InsertPlaceholders(Map, TraverseOrder, NewPhiNodes, NewSelectNodes);
+
+ // Second Step, fill new nodes by merged values and simplify if possible.
+ FillPlaceholders(Map, TraverseOrder, ST);
+
+ if (!AddrSinkNewSelects && NewSelectNodes.size() > 0) {
+ DestroyNodes(NewPhiNodes);
+ DestroyNodes(NewSelectNodes);
+ return nullptr;
+ }
+
+    // Now we'd like to match new Phi nodes to existing ones.
+ unsigned PhiNotMatchedCount = 0;
+ if (!MatchPhiSet(NewPhiNodes, ST, AddrSinkNewPhis, PhiNotMatchedCount)) {
+ DestroyNodes(NewPhiNodes);
+ DestroyNodes(NewSelectNodes);
+ return nullptr;
+ }
+
+ auto *Result = ST.Get(Map.find(Original)->second);
+ if (Result) {
+ NumMemoryInstsPhiCreated += NewPhiNodes.size() + PhiNotMatchedCount;
+ NumMemoryInstsSelectCreated += NewSelectNodes.size();
+ }
+ return Result;
+ }
+
+ /// \brief Destroy nodes from a set.
+ template <typename T> void DestroyNodes(SmallPtrSetImpl<T *> &Instructions) {
+    // For safe erasing, replace the Phi with a dummy value first.
+ auto Dummy = UndefValue::get(CommonType);
+ for (auto I : Instructions) {
+ I->replaceAllUsesWith(Dummy);
+ I->eraseFromParent();
+ }
+ }
+
+ /// \brief Try to match PHI node to Candidate.
+ /// Matcher tracks the matched Phi nodes.
+ bool MatchPhiNode(PHINode *PHI, PHINode *Candidate,
+ DenseSet<PHIPair> &Matcher,
+ SmallPtrSetImpl<PHINode *> &PhiNodesToMatch) {
+ SmallVector<PHIPair, 8> WorkList;
+ Matcher.insert({ PHI, Candidate });
+ WorkList.push_back({ PHI, Candidate });
+ SmallSet<PHIPair, 8> Visited;
+ while (!WorkList.empty()) {
+ auto Item = WorkList.pop_back_val();
+ if (!Visited.insert(Item).second)
+ continue;
+      // We iterate over all incoming values of the Phi to compare them.
+      // If the values are different, both of them are Phis, the first one is
+      // a Phi we added (subject to match), and both are in the same basic
+      // block, then we can match our pair if the incoming values match. So we
+      // state that these values match and add them to the work list to
+      // verify that.
+ for (auto B : Item.first->blocks()) {
+ Value *FirstValue = Item.first->getIncomingValueForBlock(B);
+ Value *SecondValue = Item.second->getIncomingValueForBlock(B);
+ if (FirstValue == SecondValue)
+ continue;
+
+ PHINode *FirstPhi = dyn_cast<PHINode>(FirstValue);
+ PHINode *SecondPhi = dyn_cast<PHINode>(SecondValue);
+
+        // If one of them is not a Phi, or
+        // the first one is not a Phi node from the set we'd like to match, or
+        // the Phi nodes are from different basic blocks, then
+        // we will not be able to match.
+ if (!FirstPhi || !SecondPhi || !PhiNodesToMatch.count(FirstPhi) ||
+ FirstPhi->getParent() != SecondPhi->getParent())
+ return false;
+
+ // If we already matched them then continue.
+ if (Matcher.count({ FirstPhi, SecondPhi }))
+ continue;
+        // So the values are different and do not match. So we need them to
+        // match.
+ Matcher.insert({ FirstPhi, SecondPhi });
+        // But we must check it.
+ WorkList.push_back({ FirstPhi, SecondPhi });
+ }
+ }
+ return true;
}
-};
+  /// \brief For the given set of PHI nodes try to find their equivalents.
+  /// Returns false if this matching fails and creation of new Phis is
+  /// disabled.
+ bool MatchPhiSet(SmallPtrSetImpl<PHINode *> &PhiNodesToMatch,
+ SimplificationTracker &ST, bool AllowNewPhiNodes,
+ unsigned &PhiNotMatchedCount) {
+ DenseSet<PHIPair> Matched;
+ SmallPtrSet<PHINode *, 8> WillNotMatch;
+ while (PhiNodesToMatch.size()) {
+ PHINode *PHI = *PhiNodesToMatch.begin();
+
+      // Add this Phi itself; if there are no Phi nodes in the basic block to
+      // match against, it does not match.
+ WillNotMatch.clear();
+ WillNotMatch.insert(PHI);
+
+      // Traverse all Phis until we find an equivalent one or fail to do so.
+ bool IsMatched = false;
+ for (auto &P : PHI->getParent()->phis()) {
+ if (&P == PHI)
+ continue;
+ if ((IsMatched = MatchPhiNode(PHI, &P, Matched, PhiNodesToMatch)))
+ break;
+        // If it does not match, collect all Phi nodes from the matcher;
+        // if we end up with no match, then all these Phi nodes will not
+        // match later.
+ for (auto M : Matched)
+ WillNotMatch.insert(M.first);
+ Matched.clear();
+ }
+ if (IsMatched) {
+ // Replace all matched values and erase them.
+ for (auto MV : Matched) {
+ MV.first->replaceAllUsesWith(MV.second);
+ PhiNodesToMatch.erase(MV.first);
+ ST.Put(MV.first, MV.second);
+ MV.first->eraseFromParent();
+ }
+ Matched.clear();
+ continue;
+ }
+ // If we are not allowed to create new nodes then bail out.
+ if (!AllowNewPhiNodes)
+ return false;
+ // Just remove all seen values in matcher. They will not match anything.
+ PhiNotMatchedCount += WillNotMatch.size();
+ for (auto *P : WillNotMatch)
+ PhiNodesToMatch.erase(P);
+ }
+ return true;
+ }
+  /// \brief Fill the placeholders with values from predecessors and simplify
+  /// them.
+ void FillPlaceholders(FoldAddrToValueMapping &Map,
+ SmallVectorImpl<ValueInBB> &TraverseOrder,
+ SimplificationTracker &ST) {
+ while (!TraverseOrder.empty()) {
+ auto Current = TraverseOrder.pop_back_val();
+ assert(Map.find(Current) != Map.end() && "No node to fill!!!");
+ Value *CurrentValue = Current.first;
+ BasicBlock *CurrentBlock = Current.second;
+ Value *V = Map[Current];
+
+ if (SelectInst *Select = dyn_cast<SelectInst>(V)) {
+ // CurrentValue also must be Select.
+ auto *CurrentSelect = cast<SelectInst>(CurrentValue);
+ auto *TrueValue = CurrentSelect->getTrueValue();
+ ValueInBB TrueItem = { TrueValue, isa<Instruction>(TrueValue)
+ ? CurrentBlock
+ : nullptr };
+ assert(Map.find(TrueItem) != Map.end() && "No True Value!");
+ Select->setTrueValue(Map[TrueItem]);
+ auto *FalseValue = CurrentSelect->getFalseValue();
+ ValueInBB FalseItem = { FalseValue, isa<Instruction>(FalseValue)
+ ? CurrentBlock
+ : nullptr };
+ assert(Map.find(FalseItem) != Map.end() && "No False Value!");
+ Select->setFalseValue(Map[FalseItem]);
+ } else {
+ // Must be a Phi node then.
+ PHINode *PHI = cast<PHINode>(V);
+ // Fill the Phi node with values from predecessors.
+ bool IsDefinedInThisBB =
+ cast<Instruction>(CurrentValue)->getParent() == CurrentBlock;
+ auto *CurrentPhi = dyn_cast<PHINode>(CurrentValue);
+ for (auto B : predecessors(CurrentBlock)) {
+ Value *PV = IsDefinedInThisBB
+ ? CurrentPhi->getIncomingValueForBlock(B)
+ : CurrentValue;
+ ValueInBB item = { PV, isa<Instruction>(PV) ? B : nullptr };
+ assert(Map.find(item) != Map.end() && "No predecessor Value!");
+ PHI->addIncoming(ST.Get(Map[item]), B);
+ }
+ }
+ // Simplify if possible.
+ Map[Current] = ST.Simplify(V);
+ }
+ }
+
+  /// Starting from a value, recursively iterates over predecessors up to
+  /// known ending values represented in a map. For each traversed block
+  /// inserts a placeholder Phi or Select.
+  /// Reports all newly created Phi/Select nodes by adding them to the set.
+  /// Also reports the order in which basic blocks have been traversed.
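+  /// (Illustrative sketch: with the map from the findCommon example above,
+  /// starting from {p, BB3} this creates a placeholder phi "sunk_phi" in BB3
+  /// and then queues {p1, BB1} and {p2, BB2}, which are already anchored in
+  /// the map, so the traversal stops there; FillPlaceholders later wires the
+  /// new phi to b1 and b2.)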
+ void InsertPlaceholders(FoldAddrToValueMapping &Map,
+ SmallVectorImpl<ValueInBB> &TraverseOrder,
+ SmallPtrSetImpl<PHINode *> &NewPhiNodes,
+ SmallPtrSetImpl<SelectInst *> &NewSelectNodes) {
+ SmallVector<ValueInBB, 32> Worklist;
+ assert((isa<PHINode>(Original.first) || isa<SelectInst>(Original.first)) &&
+ "Address must be a Phi or Select node");
+ auto *Dummy = UndefValue::get(CommonType);
+ Worklist.push_back(Original);
+ while (!Worklist.empty()) {
+ auto Current = Worklist.pop_back_val();
+      // If the value is not an instruction it is something global, a
+      // constant or a parameter, and we can say that this value is
+      // observable in any block. Set the block to null to denote it.
+      // Also note that this is how we build the anchors.
+ if (!isa<Instruction>(Current.first))
+ Current.second = nullptr;
+      // If it is already visited or it is an ending value then skip it.
+ if (Map.find(Current) != Map.end())
+ continue;
+ TraverseOrder.push_back(Current);
+
+ Value *CurrentValue = Current.first;
+ BasicBlock *CurrentBlock = Current.second;
+ // CurrentValue must be a Phi node or select. All others must be covered
+ // by anchors.
+ Instruction *CurrentI = cast<Instruction>(CurrentValue);
+ bool IsDefinedInThisBB = CurrentI->getParent() == CurrentBlock;
+
+ unsigned PredCount =
+ std::distance(pred_begin(CurrentBlock), pred_end(CurrentBlock));
+      // If CurrentValue is not defined in this basic block we are interested
+      // in the values in its predecessors.
+ if (!IsDefinedInThisBB) {
+ assert(PredCount && "Unreachable block?!");
+ PHINode *PHI = PHINode::Create(CommonType, PredCount, "sunk_phi",
+ &CurrentBlock->front());
+ Map[Current] = PHI;
+ NewPhiNodes.insert(PHI);
+ // Add all predecessors in work list.
+ for (auto B : predecessors(CurrentBlock))
+ Worklist.push_back({ CurrentValue, B });
+ continue;
+ }
+ // Value is defined in this basic block.
+ if (SelectInst *OrigSelect = dyn_cast<SelectInst>(CurrentI)) {
+ // Is it OK to get metadata from OrigSelect?!
+ // Create a Select placeholder with dummy value.
+ SelectInst *Select =
+ SelectInst::Create(OrigSelect->getCondition(), Dummy, Dummy,
+ OrigSelect->getName(), OrigSelect, OrigSelect);
+ Map[Current] = Select;
+ NewSelectNodes.insert(Select);
+ // We are interested in True and False value in this basic block.
+ Worklist.push_back({ OrigSelect->getTrueValue(), CurrentBlock });
+ Worklist.push_back({ OrigSelect->getFalseValue(), CurrentBlock });
+ } else {
+ // It must be a Phi node then.
+ auto *CurrentPhi = cast<PHINode>(CurrentI);
+ // Create new Phi node for merge of bases.
+ assert(PredCount && "Unreachable block?!");
+ PHINode *PHI = PHINode::Create(CommonType, PredCount, "sunk_phi",
+ &CurrentBlock->front());
+ Map[Current] = PHI;
+ NewPhiNodes.insert(PHI);
+
+ // Add all predecessors to the work list.
+ for (auto B : predecessors(CurrentBlock))
+ Worklist.push_back({ CurrentPhi->getIncomingValueForBlock(B), B });
+ }
+ }
+ }
+};
} // end anonymous namespace
/// Try adding ScaleReg*Scale to the current addressing mode.
@@ -4555,7 +4278,8 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// the graph are compatible.
bool PhiOrSelectSeen = false;
SmallVector<Instruction*, 16> AddrModeInsts;
- AddressingModeCombiner AddrModes;
+ const SimplifyQuery SQ(*DL, TLInfo);
+ AddressingModeCombiner AddrModes(SQ, { Addr, MemoryInst->getParent() });
TypePromotionTransaction TPT(RemovedInsts);
TypePromotionTransaction::ConstRestorationPt LastKnownGood =
TPT.getRestorationPoint();
@@ -6715,7 +6439,7 @@ bool CodeGenPrepare::placeDbgValues(Function &F) {
Instruction *Insn = &*BI++;
DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
// Leave dbg.values that refer to an alloca alone. These
- // instrinsics describe the address of a variable (= the alloca)
+ // intrinsics describe the address of a variable (= the alloca)
// being taken. They should not be moved next to the alloca
// (and to the beginning of the scope), but rather stay close to
// where said address is used.
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index a791c01c48b..9ef172274c1 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -26,11 +26,11 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cassert>
diff --git a/lib/CodeGen/DFAPacketizer.cpp b/lib/CodeGen/DFAPacketizer.cpp
index cf21316ec22..0123afb4cd8 100644
--- a/lib/CodeGen/DFAPacketizer.cpp
+++ b/lib/CodeGen/DFAPacketizer.cpp
@@ -29,12 +29,12 @@
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
#include <cassert>
diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp
index 91d18e2bcaa..ef1452087f2 100644
--- a/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -15,10 +15,10 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
diff --git a/lib/CodeGen/DetectDeadLanes.cpp b/lib/CodeGen/DetectDeadLanes.cpp
index ab9a0592e01..2613471714e 100644
--- a/lib/CodeGen/DetectDeadLanes.cpp
+++ b/lib/CodeGen/DetectDeadLanes.cpp
@@ -34,12 +34,12 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp
index 402afe75b14..cb6c3ae04c7 100644
--- a/lib/CodeGen/EarlyIfConversion.cpp
+++ b/lib/CodeGen/EarlyIfConversion.cpp
@@ -30,10 +30,10 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index e272d25047e..28f716ee6c2 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -15,10 +15,10 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
diff --git a/lib/CodeGen/ExpandMemCmp.cpp b/lib/CodeGen/ExpandMemCmp.cpp
new file mode 100644
index 00000000000..c5910c18d89
--- /dev/null
+++ b/lib/CodeGen/ExpandMemCmp.cpp
@@ -0,0 +1,828 @@
+//===--- ExpandMemCmp.cpp - Expand memcmp() to load/stores ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to expand memcmp() calls into an inline sequence of loads
+// and compares, so that the result can be computed without going through the
+// library function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "expandmemcmp"
+
+STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
+STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
+STATISTIC(NumMemCmpGreaterThanMax,
+ "Number of memcmp calls with size greater than max size");
+STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
+
+static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
+ "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
+ cl::desc("The number of loads per basic block for inline expansion of "
+ "memcmp that is only being compared against zero."));
+
+namespace {
+
+
+// This class provides helper functions to expand a memcmp library call into an
+// inline expansion.
+class MemCmpExpansion {
+ struct ResultBlock {
+ BasicBlock *BB = nullptr;
+ PHINode *PhiSrc1 = nullptr;
+ PHINode *PhiSrc2 = nullptr;
+
+ ResultBlock() = default;
+ };
+
+ CallInst *const CI;
+ ResultBlock ResBlock;
+ const uint64_t Size;
+ unsigned MaxLoadSize;
+ uint64_t NumLoadsNonOneByte;
+ const uint64_t NumLoadsPerBlock;
+ std::vector<BasicBlock *> LoadCmpBlocks;
+ BasicBlock *EndBlock;
+ PHINode *PhiRes;
+ const bool IsUsedForZeroCmp;
+ const DataLayout &DL;
+ IRBuilder<> Builder;
+ // Represents the decomposition in blocks of the expansion. For example,
+ // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
+ // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {32, 1}].
+ // TODO(courbet): Involve the target more in this computation. On X86, 7
+ // bytes can be done more efficiently with two overlapping 4-byte loads than
+ // by covering the interval with [{4, 0}, {2, 4}, {1, 6}].
+ struct LoadEntry {
+ LoadEntry(unsigned LoadSize, uint64_t Offset)
+ : LoadSize(LoadSize), Offset(Offset) {
+ assert(Offset % LoadSize == 0 && "invalid load entry");
+ }
+
+ uint64_t getGEPIndex() const { return Offset / LoadSize; }
+
+ // The size of the load for this block, in bytes.
+ const unsigned LoadSize;
+ // The offset of this load WRT the base pointer, in bytes.
+ const uint64_t Offset;
+ };
+ SmallVector<LoadEntry, 8> LoadSequence;
+
+ void createLoadCmpBlocks();
+ void createResultBlock();
+ void setupResultBlockPHINodes();
+ void setupEndBlockPHINodes();
+ Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex);
+ void emitLoadCompareBlock(unsigned BlockIndex);
+ void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
+ unsigned &LoadIndex);
+ void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex);
+ void emitMemCmpResultBlock();
+ Value *getMemCmpExpansionZeroCase();
+ Value *getMemCmpEqZeroOneBlock();
+ Value *getMemCmpOneBlock();
+
+ public:
+ MemCmpExpansion(CallInst *CI, uint64_t Size,
+ const TargetTransformInfo::MemCmpExpansionOptions &Options,
+ unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
+ unsigned NumLoadsPerBlock, const DataLayout &DL);
+
+ unsigned getNumBlocks();
+ uint64_t getNumLoads() const { return LoadSequence.size(); }
+
+ Value *getMemCmpExpansion();
+};
+
+// Initialize the basic block structure required for expansion of a memcmp
+// call with the given maximum load size and memcmp size parameter.
+// This structure includes:
+// 1. A list of load compare blocks - LoadCmpBlocks.
+// 2. An EndBlock, split from the original instruction point, which is the
+//    block to return from.
+// 3. A ResultBlock, the block to branch to for an early exit when a
+//    LoadCmpBlock finds a difference.
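+// For illustration only (assuming a hypothetical target whose
+// MemCmpExpansionOptions report LoadSizes = {8, 4, 2, 1}): for Size = 15 the
+// greedy loop in the constructor picks MaxLoadSize = 8 and builds
+// LoadSequence = {8,0}, {4,8}, {2,12}, {1,14}, i.e. four loads, three of
+// which (NumLoadsNonOneByte = 3) are wider than one byte.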
+MemCmpExpansion::MemCmpExpansion(
+ CallInst *const CI, uint64_t Size,
+ const TargetTransformInfo::MemCmpExpansionOptions &Options,
+ const unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
+ const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout)
+ : CI(CI),
+ Size(Size),
+ MaxLoadSize(0),
+ NumLoadsNonOneByte(0),
+ NumLoadsPerBlock(NumLoadsPerBlock),
+ IsUsedForZeroCmp(IsUsedForZeroCmp),
+ DL(TheDataLayout),
+ Builder(CI) {
+ assert(Size > 0 && "zero blocks");
+ // Scale the max size down if the target can load more bytes than we need.
+ size_t LoadSizeIndex = 0;
+ while (LoadSizeIndex < Options.LoadSizes.size() &&
+ Options.LoadSizes[LoadSizeIndex] > Size) {
+ ++LoadSizeIndex;
+ }
+ this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex];
+ // Compute the decomposition.
+ uint64_t CurSize = Size;
+ uint64_t Offset = 0;
+ while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) {
+ const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex];
+ assert(LoadSize > 0 && "zero load size");
+ const uint64_t NumLoadsForThisSize = CurSize / LoadSize;
+ if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
+ // Do not expand if the total number of loads is larger than what the
+ // target allows. Note that it's important that we exit before completing
+ // the expansion to avoid using a ton of memory to store the expansion for
+ // large sizes.
+ LoadSequence.clear();
+ return;
+ }
+ if (NumLoadsForThisSize > 0) {
+ for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
+ LoadSequence.push_back({LoadSize, Offset});
+ Offset += LoadSize;
+ }
+ if (LoadSize > 1) {
+ ++NumLoadsNonOneByte;
+ }
+ CurSize = CurSize % LoadSize;
+ }
+ ++LoadSizeIndex;
+ }
+ assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
+}
+
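+// E.g., with four loads and NumLoadsPerBlock = 2, a zero-equality expansion
+// uses ceil(4 / 2) = 2 load/compare blocks, while the general expansion uses
+// one block per load, i.e. 4 blocks.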
+unsigned MemCmpExpansion::getNumBlocks() {
+ if (IsUsedForZeroCmp)
+ return getNumLoads() / NumLoadsPerBlock +
+ (getNumLoads() % NumLoadsPerBlock != 0 ? 1 : 0);
+ return getNumLoads();
+}
+
+void MemCmpExpansion::createLoadCmpBlocks() {
+ for (unsigned i = 0; i < getNumBlocks(); i++) {
+ BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
+ EndBlock->getParent(), EndBlock);
+ LoadCmpBlocks.push_back(BB);
+ }
+}
+
+void MemCmpExpansion::createResultBlock() {
+ ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
+ EndBlock->getParent(), EndBlock);
+}
+
+// This function creates the IR instructions for loading and comparing 1 byte.
+// It loads 1 byte from each source of the memcmp parameters with the given
+// GEPIndex. It then subtracts the two loaded values and adds this result to the
+// final phi node for selecting the memcmp result.
+void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
+ unsigned GEPIndex) {
+ Value *Source1 = CI->getArgOperand(0);
+ Value *Source2 = CI->getArgOperand(1);
+
+ Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+ Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
+ // Cast source to LoadSizeType*.
+ if (Source1->getType() != LoadSizeType)
+ Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+ if (Source2->getType() != LoadSizeType)
+ Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+ // Get the base address using the GEPIndex.
+ if (GEPIndex != 0) {
+ Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+ ConstantInt::get(LoadSizeType, GEPIndex));
+ Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+ ConstantInt::get(LoadSizeType, GEPIndex));
+ }
+
+ Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+ Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+ LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
+ LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
+ Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+
+ PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]);
+
+ if (BlockIndex < (LoadCmpBlocks.size() - 1)) {
+ // Branch to EndBlock early if a difference was found. Otherwise, continue
+ // to the next LoadCmpBlock.
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+ ConstantInt::get(Diff->getType(), 0));
+ BranchInst *CmpBr =
+ BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp);
+ Builder.Insert(CmpBr);
+ } else {
+ // The last block has an unconditional branch to EndBlock.
+ BranchInst *CmpBr = BranchInst::Create(EndBlock);
+ Builder.Insert(CmpBr);
+ }
+}
+
+/// Generate an equality comparison for one or more pairs of loaded values.
+/// This is used in the case where the memcmp() call is compared equal or not
+/// equal to zero.
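+/// As a sketch of the generated pattern: with, say, four loads in one block,
+/// each pair is combined as x_i = xor(src1_i, src2_i) and the block then
+/// tests or(or(x0, x1), or(x2, x3)) != 0, so a single icmp decides whether
+/// any of the pairs differ.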
+Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
+ unsigned &LoadIndex) {
+ assert(LoadIndex < getNumLoads() &&
+ "getCompareLoadPairs() called with no remaining loads");
+ std::vector<Value *> XorList, OrList;
+ Value *Diff;
+
+ const unsigned NumLoads =
+ std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock);
+
+ // For a single-block expansion, start inserting before the memcmp call.
+ if (LoadCmpBlocks.empty())
+ Builder.SetInsertPoint(CI);
+ else
+ Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+
+ Value *Cmp = nullptr;
+ // If we have multiple loads per block, we need to generate a composite
+ // comparison using xor+or. The type for the combinations is the largest load
+ // type.
+ IntegerType *const MaxLoadType =
+ NumLoads == 1 ? nullptr
+ : IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+ for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
+ const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
+
+ IntegerType *LoadSizeType =
+ IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
+
+ Value *Source1 = CI->getArgOperand(0);
+ Value *Source2 = CI->getArgOperand(1);
+
+ // Cast source to LoadSizeType*.
+ if (Source1->getType() != LoadSizeType)
+ Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+ if (Source2->getType() != LoadSizeType)
+ Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+ // Get the base address using a GEP.
+ if (CurLoadEntry.Offset != 0) {
+ Source1 = Builder.CreateGEP(
+ LoadSizeType, Source1,
+ ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+ Source2 = Builder.CreateGEP(
+ LoadSizeType, Source2,
+ ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+ }
+
+ // Get a constant or load a value for each source address.
+ Value *LoadSrc1 = nullptr;
+ if (auto *Source1C = dyn_cast<Constant>(Source1))
+ LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
+ if (!LoadSrc1)
+ LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+
+ Value *LoadSrc2 = nullptr;
+ if (auto *Source2C = dyn_cast<Constant>(Source2))
+ LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
+ if (!LoadSrc2)
+ LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+ if (NumLoads != 1) {
+ if (LoadSizeType != MaxLoadType) {
+ LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
+ LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
+ }
+ // If we have multiple loads per block, we need to generate a composite
+ // comparison using xor+or.
+ Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
+ Diff = Builder.CreateZExt(Diff, MaxLoadType);
+ XorList.push_back(Diff);
+ } else {
+ // If there's only one load per block, we just compare the loaded values.
+ Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
+ }
+ }
+
+ auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
+ std::vector<Value *> OutList;
+ for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
+ Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
+ OutList.push_back(Or);
+ }
+ if (InList.size() % 2 != 0)
+ OutList.push_back(InList.back());
+ return OutList;
+ };
+
+ if (!Cmp) {
+ // Pairwise OR the XOR results.
+ OrList = pairWiseOr(XorList);
+
+ // Pairwise OR the OR results until one result left.
+ while (OrList.size() != 1) {
+ OrList = pairWiseOr(OrList);
+ }
+ Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
+ }
+
+ return Cmp;
+}
+
+void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
+ unsigned &LoadIndex) {
+ Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);
+
+ BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
+ ? EndBlock
+ : LoadCmpBlocks[BlockIndex + 1];
+ // Branch to ResultBlock early if a difference was found. Otherwise,
+ // continue to the next LoadCmpBlock or to EndBlock.
+ BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+ Builder.Insert(CmpBr);
+
+ // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0
+ // since early exit to ResultBlock was not taken (no difference was found in
+ // any of the bytes).
+ if (BlockIndex == LoadCmpBlocks.size() - 1) {
+ Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+ PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
+ }
+}
+
+// This function creates the IR instructions for loading and comparing using
+// the given LoadSize. It loads the number of bytes specified by LoadSize from
+// each source of the memcmp parameters. It then does a subtract to see if
+// there was a difference in the loaded values. If a difference is found, it
+// branches with an early exit to the ResultBlock for calculating which source
+// was larger. Otherwise, it falls through to either the next LoadCmpBlock or,
+// if this is the last LoadCmpBlock, to the EndBlock. Loading 1 byte is handled
+// as a special case through emitLoadCompareByteBlock, which simply subtracts
+// the loaded values and adds the result to the result phi node.
+void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
+ // There is one load per block in this case, BlockIndex == LoadIndex.
+ const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];
+
+ if (CurLoadEntry.LoadSize == 1) {
+ MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex,
+ CurLoadEntry.getGEPIndex());
+ return;
+ }
+
+ Type *LoadSizeType =
+ IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
+ Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+ assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
+
+ Value *Source1 = CI->getArgOperand(0);
+ Value *Source2 = CI->getArgOperand(1);
+
+ Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+ // Cast source to LoadSizeType*.
+ if (Source1->getType() != LoadSizeType)
+ Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+ if (Source2->getType() != LoadSizeType)
+ Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+ // Get the base address using a GEP.
+ if (CurLoadEntry.Offset != 0) {
+ Source1 = Builder.CreateGEP(
+ LoadSizeType, Source1,
+ ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+ Source2 = Builder.CreateGEP(
+ LoadSizeType, Source2,
+ ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+ }
+
+ // Load LoadSizeType from the base address.
+ Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+ Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+ if (DL.isLittleEndian()) {
+ Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+ Intrinsic::bswap, LoadSizeType);
+ LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+ LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+ }
+
+ if (LoadSizeType != MaxLoadType) {
+ LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
+ LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
+ }
+
+ // Add the loaded values to the phi nodes for calculating memcmp result only
+ // if result is not used in a zero equality.
+ if (!IsUsedForZeroCmp) {
+ ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]);
+ ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]);
+ }
+
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2);
+ BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
+ ? EndBlock
+ : LoadCmpBlocks[BlockIndex + 1];
+ // Branch to ResultBlock early if a difference was found. Otherwise, continue
+ // to the next LoadCmpBlock or to EndBlock.
+ BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);
+ Builder.Insert(CmpBr);
+
+ // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0
+ // since early exit to ResultBlock was not taken (no difference was found in
+ // any of the bytes).
+ if (BlockIndex == LoadCmpBlocks.size() - 1) {
+ Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+ PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
+ }
+}
+
+// This function populates the ResultBlock with a sequence to calculate the
+// memcmp result. It compares the two loaded source values and returns -1 if
+// src1 < src2 and 1 if src1 > src2.
+void MemCmpExpansion::emitMemCmpResultBlock() {
+ // Special case: if the memcmp result is only used in a zero-equality
+ // comparison, the exact result does not need to be calculated; returning 1
+ // is enough.
+ if (IsUsedForZeroCmp) {
+ BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+ Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+ Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
+ PhiRes->addIncoming(Res, ResBlock.BB);
+ BranchInst *NewBr = BranchInst::Create(EndBlock);
+ Builder.Insert(NewBr);
+ return;
+ }
+ BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+ Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
+ ResBlock.PhiSrc2);
+
+ Value *Res =
+ Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
+ ConstantInt::get(Builder.getInt32Ty(), 1));
+
+ BranchInst *NewBr = BranchInst::Create(EndBlock);
+ Builder.Insert(NewBr);
+ PhiRes->addIncoming(Res, ResBlock.BB);
+}
+
+void MemCmpExpansion::setupResultBlockPHINodes() {
+ Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+ Builder.SetInsertPoint(ResBlock.BB);
+ // Note: this assumes one load per block.
+ ResBlock.PhiSrc1 =
+ Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1");
+ ResBlock.PhiSrc2 =
+ Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2");
+}
+
+void MemCmpExpansion::setupEndBlockPHINodes() {
+ Builder.SetInsertPoint(&EndBlock->front());
+ PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
+}
+
+Value *MemCmpExpansion::getMemCmpExpansionZeroCase() {
+ unsigned LoadIndex = 0;
+ // This loop populates each of the LoadCmpBlocks with the IR sequence to
+ // handle multiple loads per block.
+ for (unsigned I = 0; I < getNumBlocks(); ++I) {
+ emitLoadCompareBlockMultipleLoads(I, LoadIndex);
+ }
+
+ emitMemCmpResultBlock();
+ return PhiRes;
+}
+
+/// A memcmp expansion that compares equality with 0 and only has one block of
+/// load and compare can bypass the compare, branch, and phi IR that is required
+/// in the general case.
+Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
+ unsigned LoadIndex = 0;
+ Value *Cmp = getCompareLoadPairs(0, LoadIndex);
+ assert(LoadIndex == getNumLoads() && "some entries were not consumed");
+ return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
+}
+
+/// A memcmp expansion that only has one block of load and compare can bypass
+/// the compare, branch, and phi IR that is required in the general case.
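+/// For sizes of at least 4 bytes the result is formed as
+/// zext(icmp ugt) - zext(icmp ult): e.g. src1 > src2 gives 1 - 0 = 1,
+/// equality gives 0 - 0 = 0, and src1 < src2 gives 0 - 1 = -1, matching the
+/// sign convention of memcmp without needing a select.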
+Value *MemCmpExpansion::getMemCmpOneBlock() {
+ assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block");
+
+ Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
+ Value *Source1 = CI->getArgOperand(0);
+ Value *Source2 = CI->getArgOperand(1);
+
+ // Cast source to LoadSizeType*.
+ if (Source1->getType() != LoadSizeType)
+ Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+ if (Source2->getType() != LoadSizeType)
+ Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+ // Load LoadSizeType from the base address.
+ Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+ Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+ if (DL.isLittleEndian() && Size != 1) {
+ Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+ Intrinsic::bswap, LoadSizeType);
+ LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+ LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+ }
+
+ if (Size < 4) {
+ // The i8 and i16 cases don't need compares. We zext the loaded values and
+ // subtract them to get the suitable negative, zero, or positive i32 result.
+ LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty());
+ LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty());
+ return Builder.CreateSub(LoadSrc1, LoadSrc2);
+ }
+
+ // The result of memcmp is negative, zero, or positive, so produce that by
+ // subtracting 2 extended compare bits: sub (ugt, ult).
+ // If a target prefers to use selects to get -1/0/1, they should be able
+ // to transform this later. The inverse transform (going from selects to math)
+ // may not be possible in the DAG because the selects got converted into
+ // branches before we got there.
+ Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
+ Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
+ Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
+ Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
+ return Builder.CreateSub(ZextUGT, ZextULT);
+}
+
+// This function expands the memcmp call into an inline expansion and returns
+// the memcmp result.
+Value *MemCmpExpansion::getMemCmpExpansion() {
+ // A memcmp that is only compared against zero and needs just one block of
+ // load and compare does not need to set up any extra blocks. This case could
+ // be handled in the DAG, but since we have all of the machinery to flexibly
+ // expand any memcmp here, we choose to handle this case too to avoid
+ // fragmented lowering.
+ if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) {
+ BasicBlock *StartBlock = CI->getParent();
+ EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+ setupEndBlockPHINodes();
+ createResultBlock();
+
+ // If return value of memcmp is not used in a zero equality, we need to
+ // calculate which source was larger. The calculation requires the
+ // two loaded source values of each load compare block.
+ // These will be saved in the phi nodes created by setupResultBlockPHINodes.
+ if (!IsUsedForZeroCmp) setupResultBlockPHINodes();
+
+ // Create the number of required load compare basic blocks.
+ createLoadCmpBlocks();
+
+ // Update the terminator added by splitBasicBlock to branch to the first
+ // LoadCmpBlock.
+ StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
+ }
+
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ if (IsUsedForZeroCmp)
+ return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock()
+ : getMemCmpExpansionZeroCase();
+
+ // TODO: Handle more than one load pair per block in getMemCmpOneBlock().
+ if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock();
+
+ for (unsigned I = 0; I < getNumBlocks(); ++I) {
+ emitLoadCompareBlock(I);
+ }
+
+ emitMemCmpResultBlock();
+ return PhiRes;
+}
+
+// This function checks to see if an expansion of memcmp can be generated.
+// It checks for a constant compare size that is less than the max inline
+// size. If an expansion cannot occur, it returns false to leave the call as a
+// library call. Otherwise, the library call is replaced with a new IR
+// instruction sequence.
+/// We want to transform:
+/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
+/// To:
+/// loadbb:
+/// %0 = bitcast i32* %buffer2 to i8*
+/// %1 = bitcast i32* %buffer1 to i8*
+/// %2 = bitcast i8* %1 to i64*
+/// %3 = bitcast i8* %0 to i64*
+/// %4 = load i64, i64* %2
+/// %5 = load i64, i64* %3
+/// %6 = call i64 @llvm.bswap.i64(i64 %4)
+/// %7 = call i64 @llvm.bswap.i64(i64 %5)
+/// %8 = sub i64 %6, %7
+/// %9 = icmp ne i64 %8, 0
+/// br i1 %9, label %res_block, label %loadbb1
+/// res_block: ; preds = %loadbb2,
+/// %loadbb1, %loadbb
+/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
+/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
+/// %10 = icmp ult i64 %phi.src1, %phi.src2
+/// %11 = select i1 %10, i32 -1, i32 1
+/// br label %endblock
+/// loadbb1: ; preds = %loadbb
+/// %12 = bitcast i32* %buffer2 to i8*
+/// %13 = bitcast i32* %buffer1 to i8*
+/// %14 = bitcast i8* %13 to i32*
+/// %15 = bitcast i8* %12 to i32*
+/// %16 = getelementptr i32, i32* %14, i32 2
+/// %17 = getelementptr i32, i32* %15, i32 2
+/// %18 = load i32, i32* %16
+/// %19 = load i32, i32* %17
+/// %20 = call i32 @llvm.bswap.i32(i32 %18)
+/// %21 = call i32 @llvm.bswap.i32(i32 %19)
+/// %22 = zext i32 %20 to i64
+/// %23 = zext i32 %21 to i64
+/// %24 = sub i64 %22, %23
+/// %25 = icmp ne i64 %24, 0
+/// br i1 %25, label %res_block, label %loadbb2
+/// loadbb2: ; preds = %loadbb1
+/// %26 = bitcast i32* %buffer2 to i8*
+/// %27 = bitcast i32* %buffer1 to i8*
+/// %28 = bitcast i8* %27 to i16*
+/// %29 = bitcast i8* %26 to i16*
+/// %30 = getelementptr i16, i16* %28, i16 6
+/// %31 = getelementptr i16, i16* %29, i16 6
+/// %32 = load i16, i16* %30
+/// %33 = load i16, i16* %31
+/// %34 = call i16 @llvm.bswap.i16(i16 %32)
+/// %35 = call i16 @llvm.bswap.i16(i16 %33)
+/// %36 = zext i16 %34 to i64
+/// %37 = zext i16 %35 to i64
+/// %38 = sub i64 %36, %37
+/// %39 = icmp ne i64 %38, 0
+/// br i1 %39, label %res_block, label %loadbb3
+/// loadbb3: ; preds = %loadbb2
+/// %40 = bitcast i32* %buffer2 to i8*
+/// %41 = bitcast i32* %buffer1 to i8*
+/// %42 = getelementptr i8, i8* %41, i8 14
+/// %43 = getelementptr i8, i8* %40, i8 14
+/// %44 = load i8, i8* %42
+/// %45 = load i8, i8* %43
+/// %46 = zext i8 %44 to i32
+/// %47 = zext i8 %45 to i32
+/// %48 = sub i32 %46, %47
+/// br label %endblock
+/// endblock: ; preds = %res_block,
+/// %loadbb3
+/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
+/// ret i32 %phi.res
+static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
+ const TargetLowering *TLI, const DataLayout *DL) {
+ NumMemCmpCalls++;
+
+ // Early exit from expansion if -Oz.
+ if (CI->getFunction()->optForMinSize())
+ return false;
+
+ // Early exit from expansion if size is not a constant.
+ ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ if (!SizeCast) {
+ NumMemCmpNotConstant++;
+ return false;
+ }
+ const uint64_t SizeVal = SizeCast->getZExtValue();
+
+ if (SizeVal == 0) {
+ return false;
+ }
+
+ // TTI call to check if target would like to expand memcmp. Also, get the
+ // available load sizes.
+ const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
+ const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
+ if (!Options) return false;
+
+ const unsigned MaxNumLoads =
+ TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());
+
+ MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
+ IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL);
+
+ // Don't expand if this will require more loads than desired by the target.
+ if (Expansion.getNumLoads() == 0) {
+ NumMemCmpGreaterThanMax++;
+ return false;
+ }
+
+ NumMemCmpInlined++;
+
+ Value *Res = Expansion.getMemCmpExpansion();
+
+ // Replace call with result of expansion and erase call.
+ CI->replaceAllUsesWith(Res);
+ CI->eraseFromParent();
+
+ return true;
+}
+
+class ExpandMemCmpPass : public FunctionPass {
+public:
+ static char ID;
+
+ ExpandMemCmpPass() : FunctionPass(ID) {
+ initializeExpandMemCmpPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F)) return false;
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC) {
+ return false;
+ }
+ const TargetLowering* TL =
+ TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering();
+
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto PA = runImpl(F, TLI, TTI, TL);
+ return !PA.areAllPreserved();
+ }
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ const TargetLowering* TL);
+ // Returns true if a change was made.
+ bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, const TargetLowering* TL,
+ const DataLayout& DL);
+};
+
+bool ExpandMemCmpPass::runOnBlock(
+ BasicBlock &BB, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, const TargetLowering* TL,
+ const DataLayout& DL) {
+ for (Instruction& I : BB) {
+ CallInst *CI = dyn_cast<CallInst>(&I);
+ if (!CI) {
+ continue;
+ }
+ LibFunc Func;
+ if (TLI->getLibFunc(ImmutableCallSite(CI), Func) &&
+ Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TL, &DL)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+
+PreservedAnalyses ExpandMemCmpPass::runImpl(
+ Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI,
+ const TargetLowering* TL) {
+ const DataLayout& DL = F.getParent()->getDataLayout();
+ bool MadeChanges = false;
+ for (auto BBIt = F.begin(); BBIt != F.end();) {
+ if (runOnBlock(*BBIt, TLI, TTI, TL, DL)) {
+ MadeChanges = true;
+ // If changes were made, restart the function from the beginning, since
+ // the structure of the function was changed.
+ BBIt = F.begin();
+ } else {
+ ++BBIt;
+ }
+ }
+ return MadeChanges ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+} // namespace
+
+char ExpandMemCmpPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ExpandMemCmpPass, "expandmemcmp",
+ "Expand memcmp() to load/stores", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp",
+ "Expand memcmp() to load/stores", false, false)
+
+FunctionPass *llvm::createExpandMemCmpPass() {
+ return new ExpandMemCmpPass();
+}
diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp
index 4ce86f27a7d..b73aeb18382 100644
--- a/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -17,9 +17,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/ExpandReductions.cpp b/lib/CodeGen/ExpandReductions.cpp
index 70dca3b74b2..abf487a4f19 100644
--- a/lib/CodeGen/ExpandReductions.cpp
+++ b/lib/CodeGen/ExpandReductions.cpp
@@ -95,7 +95,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
// and it can't be handled by generating this shuffle sequence.
// TODO: Implement scalarization of ordered reductions here for targets
// without native support.
- if (!II->getFastMathFlags().unsafeAlgebra())
+ if (!II->getFastMathFlags().isFast())
continue;
Vec = II->getArgOperand(1);
break;
diff --git a/lib/CodeGen/FEntryInserter.cpp b/lib/CodeGen/FEntryInserter.cpp
index 9781338f952..07cb7016139 100644
--- a/lib/CodeGen/FEntryInserter.cpp
+++ b/lib/CodeGen/FEntryInserter.cpp
@@ -15,10 +15,10 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
diff --git a/lib/CodeGen/GCRootLowering.cpp b/lib/CodeGen/GCRootLowering.cpp
index 35246545ca9..2064ce7ac7b 100644
--- a/lib/CodeGen/GCRootLowering.cpp
+++ b/lib/CodeGen/GCRootLowering.cpp
@@ -18,14 +18,14 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 8e31ed0a015..45eb605c3c2 100644
--- a/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -54,7 +54,7 @@
#include "llvm/Support/LowLevelTypeImpl.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/lib/CodeGen/GlobalISel/Legalizer.cpp b/lib/CodeGen/GlobalISel/Legalizer.cpp
index fb954f3c3f1..59f34d730d0 100644
--- a/lib/CodeGen/GlobalISel/Legalizer.cpp
+++ b/lib/CodeGen/GlobalISel/Legalizer.cpp
@@ -20,9 +20,9 @@
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <iterator>
diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 99f605abe06..a8cfe0b89a0 100644
--- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -173,12 +173,18 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
MIRBuilder.setInstr(MI);
+ int64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ int64_t NarrowSize = NarrowTy.getSizeInBits();
+
switch (MI.getOpcode()) {
default:
return UnableToLegalize;
case TargetOpcode::G_IMPLICIT_DEF: {
- int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() /
- NarrowTy.getSizeInBits();
+ // FIXME: add support for when SizeOp0 isn't an exact multiple of
+ // NarrowSize.
+ if (SizeOp0 % NarrowSize != 0)
+ return UnableToLegalize;
+ int NumParts = SizeOp0 / NarrowSize;
SmallVector<unsigned, 2> DstRegs;
for (int i = 0; i < NumParts; ++i) {
@@ -191,9 +197,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
return Legalized;
}
case TargetOpcode::G_ADD: {
+ // FIXME: add support for when SizeOp0 isn't an exact multiple of
+ // NarrowSize.
+ if (SizeOp0 % NarrowSize != 0)
+ return UnableToLegalize;
// Expand in terms of carry-setting/consuming G_ADDE instructions.
- int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() /
- NarrowTy.getSizeInBits();
+ int NumParts = SizeOp0 / NarrowTy.getSizeInBits();
SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs;
extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs);
@@ -221,9 +230,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
if (TypeIdx != 1)
return UnableToLegalize;
- int64_t NarrowSize = NarrowTy.getSizeInBits();
- int NumParts =
- MRI.getType(MI.getOperand(1).getReg()).getSizeInBits() / NarrowSize;
+ int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ // FIXME: add support for when SizeOp1 isn't an exact multiple of
+ // NarrowSize.
+ if (SizeOp1 % NarrowSize != 0)
+ return UnableToLegalize;
+ int NumParts = SizeOp1 / NarrowSize;
SmallVector<unsigned, 2> SrcRegs, DstRegs;
SmallVector<uint64_t, 2> Indexes;
@@ -270,12 +282,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
return Legalized;
}
case TargetOpcode::G_INSERT: {
- if (TypeIdx != 0)
+ // FIXME: add support for when SizeOp0 isn't an exact multiple of
+ // NarrowSize.
+ if (SizeOp0 % NarrowSize != 0)
return UnableToLegalize;
- int64_t NarrowSize = NarrowTy.getSizeInBits();
- int NumParts =
- MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+ int NumParts = SizeOp0 / NarrowSize;
SmallVector<unsigned, 2> SrcRegs, DstRegs;
SmallVector<uint64_t, 2> Indexes;
@@ -330,9 +342,11 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
return Legalized;
}
case TargetOpcode::G_LOAD: {
- unsigned NarrowSize = NarrowTy.getSizeInBits();
- int NumParts =
- MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+ // FIXME: add support for when SizeOp0 isn't an exact multiple of
+ // NarrowSize.
+ if (SizeOp0 % NarrowSize != 0)
+ return UnableToLegalize;
+ int NumParts = SizeOp0 / NarrowSize;
LLT OffsetTy = LLT::scalar(
MRI.getType(MI.getOperand(1).getReg()).getScalarSizeInBits());
@@ -357,9 +371,11 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
return Legalized;
}
case TargetOpcode::G_STORE: {
- unsigned NarrowSize = NarrowTy.getSizeInBits();
- int NumParts =
- MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+ // FIXME: add support for when SizeOp0 isn't an exact multiple of
+ // NarrowSize.
+ if (SizeOp0 % NarrowSize != 0)
+ return UnableToLegalize;
+ int NumParts = SizeOp0 / NarrowSize;
LLT OffsetTy = LLT::scalar(
MRI.getType(MI.getOperand(1).getReg()).getScalarSizeInBits());
@@ -381,9 +397,11 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
return Legalized;
}
case TargetOpcode::G_CONSTANT: {
- unsigned NarrowSize = NarrowTy.getSizeInBits();
- int NumParts =
- MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+ // FIXME: add support for when SizeOp0 isn't an exact multiple of
+ // NarrowSize.
+ if (SizeOp0 % NarrowSize != 0)
+ return UnableToLegalize;
+ int NumParts = SizeOp0 / NarrowSize;
const APInt &Cst = MI.getOperand(1).getCImm()->getValue();
LLVMContext &Ctx = MIRBuilder.getMF().getFunction()->getContext();
@@ -410,9 +428,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
// ...
// AN = BinOp<Ty/N> BN, CN
// A = G_MERGE_VALUES A1, ..., AN
- unsigned NarrowSize = NarrowTy.getSizeInBits();
- int NumParts =
- MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+
+ // FIXME: add support for when SizeOp0 isn't an exact multiple of
+ // NarrowSize.
+ if (SizeOp0 % NarrowSize != 0)
+ return UnableToLegalize;
+ int NumParts = SizeOp0 / NarrowSize;
// List the registers where the destination will be scattered.
SmallVector<unsigned, 2> DstRegs;
@@ -854,7 +875,12 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
case TargetOpcode::G_ADD: {
unsigned NarrowSize = NarrowTy.getSizeInBits();
unsigned DstReg = MI.getOperand(0).getReg();
- int NumParts = MRI.getType(DstReg).getSizeInBits() / NarrowSize;
+ unsigned Size = MRI.getType(DstReg).getSizeInBits();
+ int NumParts = Size / NarrowSize;
+ // FIXME: Don't know how to handle the situation where the small vectors
+ // aren't all the same size yet.
+ if (Size % NarrowSize != 0)
+ return UnableToLegalize;
MIRBuilder.setInstr(MI);
diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index e7a46eadb44..074cfa61a29 100644
--- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -28,46 +28,130 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOpcodes.h"
#include <algorithm>
-#include <cassert>
-#include <tuple>
-#include <utility>
-
+#include <map>
using namespace llvm;
-LegalizerInfo::LegalizerInfo() {
- DefaultActions[TargetOpcode::G_IMPLICIT_DEF] = NarrowScalar;
-
- // FIXME: these two can be legalized to the fundamental load/store Jakob
- // proposed. Once loads & stores are supported.
- DefaultActions[TargetOpcode::G_ANYEXT] = Legal;
- DefaultActions[TargetOpcode::G_TRUNC] = Legal;
+LegalizerInfo::LegalizerInfo() : TablesInitialized(false) {
+ // Set defaults.
+ // FIXME: these two (G_ANYEXT and G_TRUNC?) can be legalized to the
+ // fundamental load/store Jakob proposed. Once loads & stores are supported.
+ setScalarAction(TargetOpcode::G_ANYEXT, 1, {{1, Legal}});
+ setScalarAction(TargetOpcode::G_ZEXT, 1, {{1, Legal}});
+ setScalarAction(TargetOpcode::G_SEXT, 1, {{1, Legal}});
+ setScalarAction(TargetOpcode::G_TRUNC, 0, {{1, Legal}});
+ setScalarAction(TargetOpcode::G_TRUNC, 1, {{1, Legal}});
- DefaultActions[TargetOpcode::G_INTRINSIC] = Legal;
- DefaultActions[TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS] = Legal;
+ setScalarAction(TargetOpcode::G_INTRINSIC, 0, {{1, Legal}});
+ setScalarAction(TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS, 0, {{1, Legal}});
- DefaultActions[TargetOpcode::G_ADD] = NarrowScalar;
- DefaultActions[TargetOpcode::G_LOAD] = NarrowScalar;
- DefaultActions[TargetOpcode::G_STORE] = NarrowScalar;
- DefaultActions[TargetOpcode::G_OR] = NarrowScalar;
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_IMPLICIT_DEF, 0, narrowToSmallerAndUnsupportedIfTooSmall);
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_ADD, 0, widenToLargerTypesAndNarrowToLargest);
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_OR, 0, widenToLargerTypesAndNarrowToLargest);
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_LOAD, 0, narrowToSmallerAndUnsupportedIfTooSmall);
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_STORE, 0, narrowToSmallerAndUnsupportedIfTooSmall);
- DefaultActions[TargetOpcode::G_BRCOND] = WidenScalar;
- DefaultActions[TargetOpcode::G_INSERT] = NarrowScalar;
- DefaultActions[TargetOpcode::G_EXTRACT] = NarrowScalar;
- DefaultActions[TargetOpcode::G_FNEG] = Lower;
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_BRCOND, 0, widenToLargerTypesUnsupportedOtherwise);
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_INSERT, 0, narrowToSmallerAndUnsupportedIfTooSmall);
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_EXTRACT, 0, narrowToSmallerAndUnsupportedIfTooSmall);
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_EXTRACT, 1, narrowToSmallerAndUnsupportedIfTooSmall);
+ setScalarAction(TargetOpcode::G_FNEG, 0, {{1, Lower}});
}
void LegalizerInfo::computeTables() {
- for (unsigned Opcode = 0; Opcode <= LastOp - FirstOp; ++Opcode) {
- for (unsigned Idx = 0, End = Actions[Opcode].size(); Idx != End; ++Idx) {
- for (auto &Action : Actions[Opcode][Idx]) {
- LLT Ty = Action.first;
- if (!Ty.isVector())
- continue;
-
- auto &Entry = MaxLegalVectorElts[std::make_pair(Opcode + FirstOp,
- Ty.getElementType())];
- Entry = std::max(Entry, Ty.getNumElements());
+ assert(TablesInitialized == false);
+
+ for (unsigned OpcodeIdx = 0; OpcodeIdx <= LastOp - FirstOp; ++OpcodeIdx) {
+ const unsigned Opcode = FirstOp + OpcodeIdx;
+ for (unsigned TypeIdx = 0; TypeIdx != SpecifiedActions[OpcodeIdx].size();
+ ++TypeIdx) {
+ // 0. Collect information specified through the setAction API, i.e.
+ // for specific bit sizes.
+ // For scalar types:
+ SizeAndActionsVec ScalarSpecifiedActions;
+ // For pointer types:
+ std::map<uint16_t, SizeAndActionsVec> AddressSpace2SpecifiedActions;
+ // For vector types:
+ std::map<uint16_t, SizeAndActionsVec> ElemSize2SpecifiedActions;
+ for (auto LLT2Action : SpecifiedActions[OpcodeIdx][TypeIdx]) {
+ const LLT Type = LLT2Action.first;
+ const LegalizeAction Action = LLT2Action.second;
+
+ auto SizeAction = std::make_pair(Type.getSizeInBits(), Action);
+ if (Type.isPointer())
+ AddressSpace2SpecifiedActions[Type.getAddressSpace()].push_back(
+ SizeAction);
+ else if (Type.isVector())
+ ElemSize2SpecifiedActions[Type.getElementType().getSizeInBits()]
+ .push_back(SizeAction);
+ else
+ ScalarSpecifiedActions.push_back(SizeAction);
+ }
+
+ // 1. Handle scalar types
+ {
+ // Decide how to handle bit sizes for which no explicit specification
+ // was given.
+ SizeChangeStrategy S = &unsupportedForDifferentSizes;
+ if (TypeIdx < ScalarSizeChangeStrategies[OpcodeIdx].size() &&
+ ScalarSizeChangeStrategies[OpcodeIdx][TypeIdx] != nullptr)
+ S = ScalarSizeChangeStrategies[OpcodeIdx][TypeIdx];
+ std::sort(ScalarSpecifiedActions.begin(), ScalarSpecifiedActions.end());
+ checkPartialSizeAndActionsVector(ScalarSpecifiedActions);
+ setScalarAction(Opcode, TypeIdx, S(ScalarSpecifiedActions));
}
+
+ // 2. Handle pointer types
+ for (auto PointerSpecifiedActions : AddressSpace2SpecifiedActions) {
+ std::sort(PointerSpecifiedActions.second.begin(),
+ PointerSpecifiedActions.second.end());
+ checkPartialSizeAndActionsVector(PointerSpecifiedActions.second);
+ // For pointer types, we assume that there isn't a meaningful way
+ // to change the number of bits used in the pointer.
+ setPointerAction(
+ Opcode, TypeIdx, PointerSpecifiedActions.first,
+ unsupportedForDifferentSizes(PointerSpecifiedActions.second));
+ }
+
+ // 3. Handle vector types
+ SizeAndActionsVec ElementSizesSeen;
+ for (auto VectorSpecifiedActions : ElemSize2SpecifiedActions) {
+ std::sort(VectorSpecifiedActions.second.begin(),
+ VectorSpecifiedActions.second.end());
+ const uint16_t ElementSize = VectorSpecifiedActions.first;
+ ElementSizesSeen.push_back({ElementSize, Legal});
+ checkPartialSizeAndActionsVector(VectorSpecifiedActions.second);
+ // For vector types, we assume that the best way to adapt the number
+ // of elements is to legalize towards the next larger number of elements
+ // for which the vector type is legal, unless there is no such type. In
+ // that case, legalize towards a vector type with a smaller number of
+ // elements.
+ SizeAndActionsVec NumElementsActions;
+ for (SizeAndAction BitsizeAndAction : VectorSpecifiedActions.second) {
+ assert(BitsizeAndAction.first % ElementSize == 0);
+ const uint16_t NumElements = BitsizeAndAction.first / ElementSize;
+ NumElementsActions.push_back({NumElements, BitsizeAndAction.second});
+ }
+ setVectorNumElementAction(
+ Opcode, TypeIdx, ElementSize,
+ moreToWiderTypesAndLessToWidest(NumElementsActions));
+ }
+ std::sort(ElementSizesSeen.begin(), ElementSizesSeen.end());
+ SizeChangeStrategy VectorElementSizeChangeStrategy =
+ &unsupportedForDifferentSizes;
+ if (TypeIdx < VectorElementSizeChangeStrategies[OpcodeIdx].size() &&
+ VectorElementSizeChangeStrategies[OpcodeIdx][TypeIdx] != nullptr)
+ VectorElementSizeChangeStrategy =
+ VectorElementSizeChangeStrategies[OpcodeIdx][TypeIdx];
+ setScalarInVectorAction(
+ Opcode, TypeIdx, VectorElementSizeChangeStrategy(ElementSizesSeen));
}
}
@@ -90,69 +174,24 @@ LegalizerInfo::getAction(const InstrAspect &Aspect) const {
Aspect.Opcode == TargetOpcode::G_UNMERGE_VALUES)
return std::make_pair(Legal, Aspect.Type);
- LLT Ty = Aspect.Type;
- LegalizeAction Action = findInActions(Aspect);
- // LegalizerHelper is not able to handle non-power-of-2 types right now, so do
- // not try to legalize them unless they are marked as Legal or Custom.
- // FIXME: This is a temporary hack until the general non-power-of-2
- // legalization works.
- if (!isPowerOf2_64(Ty.getSizeInBits()) &&
- !(Action == Legal || Action == Custom))
- return std::make_pair(Unsupported, LLT());
-
- if (Action != NotFound)
- return findLegalAction(Aspect, Action);
-
- unsigned Opcode = Aspect.Opcode;
- if (!Ty.isVector()) {
- auto DefaultAction = DefaultActions.find(Aspect.Opcode);
- if (DefaultAction != DefaultActions.end() && DefaultAction->second == Legal)
- return std::make_pair(Legal, Ty);
-
- if (DefaultAction != DefaultActions.end() && DefaultAction->second == Lower)
- return std::make_pair(Lower, Ty);
-
- if (DefaultAction == DefaultActions.end() ||
- DefaultAction->second != NarrowScalar)
- return std::make_pair(Unsupported, LLT());
- return findLegalAction(Aspect, NarrowScalar);
- }
-
- LLT EltTy = Ty.getElementType();
- int NumElts = Ty.getNumElements();
-
- auto ScalarAction = ScalarInVectorActions.find(std::make_pair(Opcode, EltTy));
- if (ScalarAction != ScalarInVectorActions.end() &&
- ScalarAction->second != Legal)
- return findLegalAction(Aspect, ScalarAction->second);
-
- // The element type is legal in principle, but the number of elements is
- // wrong.
- auto MaxLegalElts = MaxLegalVectorElts.lookup(std::make_pair(Opcode, EltTy));
- if (MaxLegalElts > NumElts)
- return findLegalAction(Aspect, MoreElements);
-
- if (MaxLegalElts == 0) {
- // Scalarize if there's no legal vector type, which is just a special case
- // of FewerElements.
- return std::make_pair(FewerElements, EltTy);
- }
-
- return findLegalAction(Aspect, FewerElements);
+ if (Aspect.Type.isScalar() || Aspect.Type.isPointer())
+ return findScalarLegalAction(Aspect);
+ assert(Aspect.Type.isVector());
+ return findVectorLegalAction(Aspect);
}
std::tuple<LegalizerInfo::LegalizeAction, unsigned, LLT>
LegalizerInfo::getAction(const MachineInstr &MI,
const MachineRegisterInfo &MRI) const {
SmallBitVector SeenTypes(8);
- const MCInstrDesc &MCID = MI.getDesc();
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) {
+ const MCOperandInfo *OpInfo = MI.getDesc().OpInfo;
+ // FIXME: probably we'll need to cache the results here somehow?
+ for (unsigned i = 0; i < MI.getDesc().getNumOperands(); ++i) {
if (!OpInfo[i].isGenericType())
continue;
- // We don't want to repeatedly check the same operand index, that
- // could get expensive.
+ // We must only record actions once for each TypeIdx; otherwise we'd
+ // try to legalize operands multiple times down the line.
unsigned TypeIdx = OpInfo[i].getGenericTypeIndex();
if (SeenTypes[TypeIdx])
continue;
@@ -172,38 +211,166 @@ bool LegalizerInfo::isLegal(const MachineInstr &MI,
return std::get<0>(getAction(MI, MRI)) == Legal;
}
-Optional<LLT> LegalizerInfo::findLegalType(const InstrAspect &Aspect,
- LegalizeAction Action) const {
- switch(Action) {
- default:
- llvm_unreachable("Cannot find legal type");
+bool LegalizerInfo::legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ return false;
+}
+
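+// For illustration: given v = {{8, Legal}, {32, Legal}}, IncreaseAction =
+// WidenScalar and DecreaseAction = NarrowScalar, the function below produces
+// {{1, WidenScalar}, {8, Legal}, {9, WidenScalar}, {32, Legal},
+//  {33, NarrowScalar}}: sizes below or between the legal sizes get widened,
+// and sizes above the largest legal size get narrowed.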
+LegalizerInfo::SizeAndActionsVec
+LegalizerInfo::increaseToLargerTypesAndDecreaseToLargest(
+ const SizeAndActionsVec &v, LegalizeAction IncreaseAction,
+ LegalizeAction DecreaseAction) {
+ SizeAndActionsVec result;
+ unsigned LargestSizeSoFar = 0;
+ if (v.size() >= 1 && v[0].first != 1)
+ result.push_back({1, IncreaseAction});
+ for (size_t i = 0; i < v.size(); ++i) {
+ result.push_back(v[i]);
+ LargestSizeSoFar = v[i].first;
+ if (i + 1 < v.size() && v[i + 1].first != v[i].first + 1) {
+ result.push_back({LargestSizeSoFar + 1, IncreaseAction});
+ LargestSizeSoFar = v[i].first + 1;
+ }
+ }
+ result.push_back({LargestSizeSoFar + 1, DecreaseAction});
+ return result;
+}
+
+LegalizerInfo::SizeAndActionsVec
+LegalizerInfo::decreaseToSmallerTypesAndIncreaseToSmallest(
+ const SizeAndActionsVec &v, LegalizeAction DecreaseAction,
+ LegalizeAction IncreaseAction) {
+ SizeAndActionsVec result;
+ if (v.size() == 0 || v[0].first != 1)
+ result.push_back({1, IncreaseAction});
+ for (size_t i = 0; i < v.size(); ++i) {
+ result.push_back(v[i]);
+ if (i + 1 == v.size() || v[i + 1].first != v[i].first + 1) {
+ result.push_back({v[i].first + 1, DecreaseAction});
+ }
+ }
+ return result;
+}
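
For illustration, a minimal standalone sketch of the table-building rule implemented by increaseToLargerTypesAndDecreaseToLargest above: each hole between listed sizes is filled with the increase action, and everything above the largest listed size gets the decrease action. The enum and vector types are simplified stand-ins for the LLVM ones, not the real API.

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

enum Action { Legal, WidenScalar, NarrowScalar };
using SizeAndAction = std::pair<uint32_t, Action>;
using SizeAndActionsVec = std::vector<SizeAndAction>;

// Mirrors increaseToLargerTypesAndDecreaseToLargest with IncreaseAction ==
// WidenScalar and DecreaseAction == NarrowScalar.
static SizeAndActionsVec widenToLargerAndNarrowToLargest(const SizeAndActionsVec &V) {
  SizeAndActionsVec Result;
  uint32_t Largest = 0;
  if (!V.empty() && V[0].first != 1)
    Result.push_back({1, WidenScalar});
  for (size_t I = 0; I < V.size(); ++I) {
    Result.push_back(V[I]);
    Largest = V[I].first;
    if (I + 1 < V.size() && V[I + 1].first != V[I].first + 1) {
      Result.push_back({Largest + 1, WidenScalar});
      Largest = V[I].first + 1;
    }
  }
  Result.push_back({Largest + 1, NarrowScalar});
  return Result;
}

int main() {
  // Declaring s8, s16 and s32 legal expands to:
  // (1,Widen) (8,Legal) (9,Widen) (16,Legal) (17,Widen) (32,Legal) (33,Narrow)
  for (const auto &SA : widenToLargerAndNarrowToLargest({{8, Legal}, {16, Legal}, {32, Legal}}))
    std::printf("(%u,%d) ", SA.first, SA.second);
  std::printf("\n");
}
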
+
+LegalizerInfo::SizeAndAction
+LegalizerInfo::findAction(const SizeAndActionsVec &Vec, const uint32_t Size) {
+ assert(Size >= 1);
+ // Find the last element in Vec that has a bitsize equal to or smaller than
+ // the requested bit size.
+ // That is the element just before the first element that is bigger than Size.
+ auto VecIt = std::upper_bound(
+ Vec.begin(), Vec.end(), Size,
+ [](const uint32_t Size, const SizeAndAction lhs) -> bool {
+ return Size < lhs.first;
+ });
+ assert(VecIt != Vec.begin() && "Does Vec not start with size 1?");
+ --VecIt;
+ int VecIdx = VecIt - Vec.begin();
+
+ LegalizeAction Action = Vec[VecIdx].second;
+ switch (Action) {
case Legal:
case Lower:
case Libcall:
case Custom:
- return Aspect.Type;
+ return {Size, Action};
+ case FewerElements:
+ // FIXME: is this special case still needed and correct?
+ // Special case for scalarization:
+ if (Vec == SizeAndActionsVec({{1, FewerElements}}))
+ return {1, FewerElements};
+ LLVM_FALLTHROUGH;
case NarrowScalar: {
- return findLegalizableSize(
- Aspect, [&](LLT Ty) -> LLT { return Ty.halfScalarSize(); });
- }
- case WidenScalar: {
- return findLegalizableSize(Aspect, [&](LLT Ty) -> LLT {
- return Ty.getSizeInBits() < 8 ? LLT::scalar(8) : Ty.doubleScalarSize();
- });
- }
- case FewerElements: {
- return findLegalizableSize(
- Aspect, [&](LLT Ty) -> LLT { return Ty.halfElements(); });
+ // The following needs to be a loop because, for now, we allow stepping
+ // over "Unsupported" bit sizes before finding a legalizable bit size.
+ // E.g. with (s8, WidenScalar), (s9, Unsupported), (s32, Legal): if Size==8,
+ // we need to iterate over s9 and then to s32 to return (s32, Legal).
+ // If we want to get rid of the below loop, we should have stronger asserts
+ // when building the SizeAndActionsVecs, probably not allowing
+ // "Unsupported" except at the ends of the vector.
+ for (int i = VecIdx - 1; i >= 0; --i)
+ if (!needsLegalizingToDifferentSize(Vec[i].second) &&
+ Vec[i].second != Unsupported)
+ return {Vec[i].first, Action};
+ llvm_unreachable("");
}
+ case WidenScalar:
case MoreElements: {
- return findLegalizableSize(
- Aspect, [&](LLT Ty) -> LLT { return Ty.doubleElements(); });
+ // See above, the following needs to be a loop, at least for now.
+ for (std::size_t i = VecIdx + 1; i < Vec.size(); ++i)
+ if (!needsLegalizingToDifferentSize(Vec[i].second) &&
+ Vec[i].second != Unsupported)
+ return {Vec[i].first, Action};
+ llvm_unreachable("");
}
+ case Unsupported:
+ return {Size, Unsupported};
+ case NotFound:
+ llvm_unreachable("NotFound");
}
+ llvm_unreachable("Action has an unknown enum value");
}
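
As a rough companion to findAction above, a self-contained sketch of the lookup on one of those tables: std::upper_bound picks the governing entry for the queried size, and when that entry asks for a resize the search walks down (NarrowScalar) or up (WidenScalar) to the nearest size that needs no further resizing. The types are simplified stand-ins, and only Legal is treated as a resting point here, unlike the real needsLegalizingToDifferentSize check.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

enum Action { Legal, WidenScalar, NarrowScalar };
using SizeAndAction = std::pair<uint32_t, Action>;
using SizeAndActionsVec = std::vector<SizeAndAction>;

// Assumes the table starts at size 1 and Size >= 1.
static SizeAndAction findAction(const SizeAndActionsVec &Vec, uint32_t Size) {
  // Last entry whose size is <= the queried size.
  auto It = std::upper_bound(
      Vec.begin(), Vec.end(), Size,
      [](uint32_t S, const SizeAndAction &E) { return S < E.first; });
  int Idx = int(It - Vec.begin()) - 1;
  Action A = Vec[Idx].second;
  if (A == Legal)
    return {Size, A};
  if (A == NarrowScalar) { // walk down to the nearest smaller legal size
    for (int I = Idx - 1; I >= 0; --I)
      if (Vec[I].second == Legal)
        return {Vec[I].first, A};
  } else {                 // WidenScalar: walk up instead
    for (size_t I = Idx + 1; I < Vec.size(); ++I)
      if (Vec[I].second == Legal)
        return {Vec[I].first, A};
  }
  return {Size, A};
}

int main() {
  SizeAndActionsVec Table = {{1, WidenScalar}, {8, Legal},        {9, WidenScalar},
                             {16, Legal},      {17, WidenScalar}, {32, Legal},
                             {33, NarrowScalar}};
  // Prints: s7 widens to s8, s16 is already legal, s40 narrows to s32.
  for (uint32_t Size : {7u, 16u, 40u}) {
    SizeAndAction R = findAction(Table, Size);
    std::printf("s%u -> action %d at s%u\n", Size, R.second, R.first);
  }
}
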
-bool LegalizerInfo::legalizeCustom(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const {
- return false;
+std::pair<LegalizerInfo::LegalizeAction, LLT>
+LegalizerInfo::findScalarLegalAction(const InstrAspect &Aspect) const {
+ assert(Aspect.Type.isScalar() || Aspect.Type.isPointer());
+ if (Aspect.Opcode < FirstOp || Aspect.Opcode > LastOp)
+ return {NotFound, LLT()};
+ const unsigned OpcodeIdx = Aspect.Opcode - FirstOp;
+ if (Aspect.Type.isPointer() &&
+ AddrSpace2PointerActions[OpcodeIdx].find(Aspect.Type.getAddressSpace()) ==
+ AddrSpace2PointerActions[OpcodeIdx].end()) {
+ return {NotFound, LLT()};
+ }
+ const SmallVector<SizeAndActionsVec, 1> &Actions =
+ Aspect.Type.isPointer()
+ ? AddrSpace2PointerActions[OpcodeIdx]
+ .find(Aspect.Type.getAddressSpace())
+ ->second
+ : ScalarActions[OpcodeIdx];
+ if (Aspect.Idx >= Actions.size())
+ return {NotFound, LLT()};
+ const SizeAndActionsVec &Vec = Actions[Aspect.Idx];
+ // FIXME: speed up this search, e.g. by using a results cache for repeated
+ // queries?
+ auto SizeAndAction = findAction(Vec, Aspect.Type.getSizeInBits());
+ return {SizeAndAction.second,
+ Aspect.Type.isScalar() ? LLT::scalar(SizeAndAction.first)
+ : LLT::pointer(Aspect.Type.getAddressSpace(),
+ SizeAndAction.first)};
+}
+
+std::pair<LegalizerInfo::LegalizeAction, LLT>
+LegalizerInfo::findVectorLegalAction(const InstrAspect &Aspect) const {
+ assert(Aspect.Type.isVector());
+ // First legalize the vector element size, then legalize the number of
+ // lanes in the vector.
+ if (Aspect.Opcode < FirstOp || Aspect.Opcode > LastOp)
+ return {NotFound, Aspect.Type};
+ const unsigned OpcodeIdx = Aspect.Opcode - FirstOp;
+ const unsigned TypeIdx = Aspect.Idx;
+ if (TypeIdx >= ScalarInVectorActions[OpcodeIdx].size())
+ return {NotFound, Aspect.Type};
+ const SizeAndActionsVec &ElemSizeVec =
+ ScalarInVectorActions[OpcodeIdx][TypeIdx];
+
+ LLT IntermediateType;
+ auto ElementSizeAndAction =
+ findAction(ElemSizeVec, Aspect.Type.getScalarSizeInBits());
+ IntermediateType =
+ LLT::vector(Aspect.Type.getNumElements(), ElementSizeAndAction.first);
+ if (ElementSizeAndAction.second != Legal)
+ return {ElementSizeAndAction.second, IntermediateType};
+
+ auto i = NumElements2Actions[OpcodeIdx].find(
+ IntermediateType.getScalarSizeInBits());
+ if (i == NumElements2Actions[OpcodeIdx].end()) {
+ return {NotFound, IntermediateType};
+ }
+ const SizeAndActionsVec &NumElementsVec = (*i).second[TypeIdx];
+ auto NumElementsAndAction =
+ findAction(NumElementsVec, IntermediateType.getNumElements());
+ return {NumElementsAndAction.second,
+ LLT::vector(NumElementsAndAction.first,
+ IntermediateType.getScalarSizeInBits())};
}
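
A hedged, self-contained sketch of the two-phase decision in findVectorLegalAction: the element size is legalized first, and only when the element type is already legal does the lane count get looked up. The tables and types below are made-up stand-ins (assuming, for illustration, that s16 widens to s32 and that only <4 x s32> and <2 x s64> are legal), not the LLVM data structures.

#include <cstdio>
#include <map>

struct VecTy { unsigned NumElts; unsigned EltBits; };

// One query step, mirroring the shape of findVectorLegalAction: legalize the
// element size first; only when that is already legal look at the lane count.
static void query(VecTy Ty, const std::map<unsigned, unsigned> &LegalEltBits,
                  const std::map<unsigned, unsigned> &LegalLanes) {
  unsigned EltBits = LegalEltBits.at(Ty.EltBits);
  if (EltBits != Ty.EltBits) {
    std::printf("<%u x s%u>: WidenScalar -> <%u x s%u>\n", Ty.NumElts, Ty.EltBits,
                Ty.NumElts, EltBits);
    return; // the legalizer re-queries with the widened element type
  }
  unsigned Lanes = LegalLanes.at(EltBits);
  if (Lanes != Ty.NumElts) {
    std::printf("<%u x s%u>: FewerElements -> <%u x s%u>\n", Ty.NumElts, Ty.EltBits,
                Lanes, EltBits);
    return;
  }
  std::printf("<%u x s%u>: Legal\n", Ty.NumElts, Ty.EltBits);
}

int main() {
  std::map<unsigned, unsigned> LegalEltBits = {{16, 32}, {32, 32}, {64, 64}};
  std::map<unsigned, unsigned> LegalLanes = {{32, 4}, {64, 2}};

  query({8, 16}, LegalEltBits, LegalLanes); // WidenScalar   -> <8 x s32>
  query({8, 32}, LegalEltBits, LegalLanes); // FewerElements -> <4 x s32>
  query({4, 32}, LegalEltBits, LegalLanes); // Legal
}
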
diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index b4fe5f15fdd..230c9ef04c5 100644
--- a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -15,8 +15,8 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
index f117c609453..3854e9b263d 100644
--- a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
+++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
@@ -19,10 +19,10 @@
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/GlobalISel/Utils.cpp b/lib/CodeGen/GlobalISel/Utils.cpp
index a9f3d73a294..9ae0f970f42 100644
--- a/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/lib/CodeGen/GlobalISel/Utils.cpp
@@ -17,9 +17,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#define DEBUG_TYPE "globalisel-utils"
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 08720d1271f..6d2a55c65f4 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -31,6 +31,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -40,7 +41,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp
index bf0f88d49a8..b1cac2a107d 100644
--- a/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/lib/CodeGen/ImplicitNullChecks.cpp
@@ -44,6 +44,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/LLVMContext.h"
@@ -51,7 +52,6 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 2e991de6221..a383b72dce6 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -41,6 +41,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
@@ -49,7 +50,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/LiveDebugValues.cpp b/lib/CodeGen/LiveDebugValues.cpp
index a45b1e39fee..9328e2d900a 100644
--- a/lib/CodeGen/LiveDebugValues.cpp
+++ b/lib/CodeGen/LiveDebugValues.cpp
@@ -36,6 +36,8 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
@@ -46,8 +48,6 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 0c81306a9a5..1923a8c2529 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -39,6 +39,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
@@ -51,7 +52,6 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index 92cca1a5495..61fbfdd64a2 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -16,10 +16,10 @@
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index a9aec926115..f9c5652e8a1 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -34,10 +34,10 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include <algorithm>
using namespace llvm;
diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp
index 2eab0376da2..33ae476bf4a 100644
--- a/lib/CodeGen/LocalStackSlotAllocation.cpp
+++ b/lib/CodeGen/LocalStackSlotAllocation.cpp
@@ -30,7 +30,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MIRCanonicalizerPass.cpp b/lib/CodeGen/MIRCanonicalizerPass.cpp
new file mode 100644
index 00000000000..62596440c73
--- /dev/null
+++ b/lib/CodeGen/MIRCanonicalizerPass.cpp
@@ -0,0 +1,626 @@
+//===-------------- MIRCanonicalizer.cpp - MIR Canonicalizer --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The purpose of this pass is to employ a canonical code transformation so
+// that code compiled with slightly different IR passes can be diffed more
+// effectively than otherwise. This is done by renaming vregs in a given
+// LiveRange in a canonical way. This pass also does a pseudo-scheduling to
+// move defs closer to their uses in order to reduce diffs caused by slightly
+// different schedules.
+//
+// Basic Usage:
+//
+// llc -o - -run-pass mir-canonicalizer example.mir
+//
+// Reorders instructions canonically.
+// Renames virtual register operands canonically.
+// Strips certain MIR artifacts (optionally).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <queue>
+
+using namespace llvm;
+
+namespace llvm {
+extern char &MIRCanonicalizerID;
+} // namespace llvm
+
+#define DEBUG_TYPE "mir-canonicalizer"
+
+static cl::opt<unsigned>
+CanonicalizeFunctionNumber("canon-nth-function", cl::Hidden, cl::init(~0u),
+ cl::value_desc("N"),
+ cl::desc("Function number to canonicalize."));
+
+static cl::opt<unsigned>
+CanonicalizeBasicBlockNumber("canon-nth-basicblock", cl::Hidden, cl::init(~0u),
+ cl::value_desc("N"),
+ cl::desc("BasicBlock number to canonicalize."));
+
+namespace {
+
+class MIRCanonicalizer : public MachineFunctionPass {
+public:
+ static char ID;
+ MIRCanonicalizer() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "Rename register operands in a canonical ordering.";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // end anonymous namespace
+
+enum VRType { RSE_Reg = 0, RSE_FrameIndex, RSE_NewCandidate };
+class TypedVReg {
+ VRType type;
+ unsigned reg;
+
+public:
+ TypedVReg(unsigned reg) : type(RSE_Reg), reg(reg) {}
+ TypedVReg(VRType type) : type(type), reg(~0U) {
+ assert(type != RSE_Reg && "Expected a non-register type.");
+ }
+
+ bool isReg() const { return type == RSE_Reg; }
+ bool isFrameIndex() const { return type == RSE_FrameIndex; }
+ bool isCandidate() const { return type == RSE_NewCandidate; }
+
+ VRType getType() const { return type; }
+ unsigned getReg() const {
+ assert(this->isReg() && "Expected a virtual or physical register.");
+ return reg;
+ }
+};
+
+char MIRCanonicalizer::ID;
+
+char &llvm::MIRCanonicalizerID = MIRCanonicalizer::ID;
+
+INITIALIZE_PASS_BEGIN(MIRCanonicalizer, "mir-canonicalizer",
+ "Rename Register Operands Canonically", false, false)
+
+INITIALIZE_PASS_END(MIRCanonicalizer, "mir-canonicalizer",
+ "Rename Register Operands Canonically", false, false)
+
+static std::vector<MachineBasicBlock *> GetRPOList(MachineFunction &MF) {
+ ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
+ std::vector<MachineBasicBlock *> RPOList;
+ for (auto MBB : RPOT) {
+ RPOList.push_back(MBB);
+ }
+
+ return RPOList;
+}
+
+// Get a dummy vreg. We use this vreg's register class to generate throw-away
+// vregs that are used to skip vreg numbers so that vreg numbers line up.
+static unsigned GetDummyVReg(const MachineFunction &MF) {
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+ return MO.getReg();
+ }
+ }
+ }
+
+ return ~0U;
+}
+
+static bool rescheduleCanonically(MachineBasicBlock *MBB) {
+
+ bool Changed = false;
+
+ // Calculates the distance of MI from the beginning of its parent BB.
+ auto getInstrIdx = [](const MachineInstr &MI) {
+ unsigned i = 0;
+ for (auto &CurMI : *MI.getParent()) {
+ if (&CurMI == &MI)
+ return i;
+ i++;
+ }
+ return ~0U;
+ };
+
+ // Pre-Populate vector of instructions to reschedule so that we don't
+ // clobber the iterator.
+ std::vector<MachineInstr *> Instructions;
+ for (auto &MI : *MBB) {
+ Instructions.push_back(&MI);
+ }
+
+ for (auto *II : Instructions) {
+ if (II->getNumOperands() == 0)
+ continue;
+
+ MachineOperand &MO = II->getOperand(0);
+ if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+
+ DEBUG(dbgs() << "Operand " << 0 << " of "; II->dump(); MO.dump(););
+
+ MachineInstr *Def = II;
+ unsigned Distance = ~0U;
+ MachineInstr *UseToBringDefCloserTo = nullptr;
+ MachineRegisterInfo *MRI = &MBB->getParent()->getRegInfo();
+ for (auto &UO : MRI->use_nodbg_operands(MO.getReg())) {
+ MachineInstr *UseInst = UO.getParent();
+
+ const unsigned DefLoc = getInstrIdx(*Def);
+ const unsigned UseLoc = getInstrIdx(*UseInst);
+ const unsigned Delta = (UseLoc - DefLoc);
+
+ if (UseInst->getParent() != Def->getParent())
+ continue;
+ if (DefLoc >= UseLoc)
+ continue;
+
+ if (Delta < Distance) {
+ Distance = Delta;
+ UseToBringDefCloserTo = UseInst;
+ }
+ }
+
+ const auto BBE = MBB->instr_end();
+ MachineBasicBlock::iterator DefI = BBE;
+ MachineBasicBlock::iterator UseI = BBE;
+
+ for (auto BBI = MBB->instr_begin(); BBI != BBE; ++BBI) {
+
+ if (DefI != BBE && UseI != BBE)
+ break;
+
+ if ((&*BBI != Def) && (&*BBI != UseToBringDefCloserTo))
+ continue;
+
+ if (&*BBI == Def) {
+ DefI = BBI;
+ continue;
+ }
+
+ if (&*BBI == UseToBringDefCloserTo) {
+ UseI = BBI;
+ continue;
+ }
+ }
+
+ if (DefI == BBE || UseI == BBE)
+ continue;
+
+ DEBUG({
+ dbgs() << "Splicing ";
+ DefI->dump();
+ dbgs() << " right before: ";
+ UseI->dump();
+ });
+
+ Changed = true;
+ MBB->splice(UseI, MBB, DefI);
+ }
+
+ return Changed;
+}
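
The pseudo-scheduling above boils down to: for each def, find its nearest in-block use that comes after it and splice the def to sit right before that use. A tiny standalone sketch of that selection and splice, with instructions modelled as opaque integer ids (everything here is a made-up stand-in, not MachineInstr or MachineBasicBlock):

#include <algorithm>
#include <cstdio>
#include <list>
#include <vector>

int main() {
  // Instruction stream: id 10 defines a value that ids 30 and 42 use.
  std::list<int> Block = {10, 20, 30, 40, 42};
  int Def = 10;
  std::vector<int> Uses = {30, 42};

  auto indexOf = [&](int Id) {
    return std::distance(Block.begin(), std::find(Block.begin(), Block.end(), Id));
  };

  // Pick the use with the smallest positive distance from the def.
  long long DefLoc = indexOf(Def), Best = -1, BestDelta = -1;
  for (int U : Uses) {
    long long Delta = indexOf(U) - DefLoc;
    if (Delta > 0 && (BestDelta < 0 || Delta < BestDelta)) {
      BestDelta = Delta;
      Best = U;
    }
  }

  // Splice the def right before its nearest use (cf. MBB->splice(UseI, MBB, DefI)).
  if (Best != -1)
    Block.splice(std::find(Block.begin(), Block.end(), int(Best)), Block,
                 std::find(Block.begin(), Block.end(), Def));

  for (int Id : Block)
    std::printf("%d ", Id); // prints: 20 10 30 40 42
  std::printf("\n");
}
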
+
+/// Here we find our candidates. What makes an interesting candidate?
+/// A candidate for a canonicalization tree root is normally any kind of
+/// instruction that causes side effects, such as a store to memory, a copy to
+/// a physical register, or a return instruction. We use these as expression
+/// tree roots that we walk in order to build a canonical walk, which should
+/// result in canonical vreg renaming.
+static std::vector<MachineInstr *> populateCandidates(MachineBasicBlock *MBB) {
+ std::vector<MachineInstr *> Candidates;
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+ for (auto II = MBB->begin(), IE = MBB->end(); II != IE; ++II) {
+ MachineInstr *MI = &*II;
+
+ bool DoesMISideEffect = false;
+
+ if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg()) {
+ const unsigned Dst = MI->getOperand(0).getReg();
+ DoesMISideEffect |= !TargetRegisterInfo::isVirtualRegister(Dst);
+
+ for (auto UI = MRI.use_begin(Dst); UI != MRI.use_end(); ++UI) {
+ if (DoesMISideEffect) break;
+ DoesMISideEffect |= (UI->getParent()->getParent() != MI->getParent());
+ }
+ }
+
+ if (!MI->mayStore() && !MI->isBranch() && !DoesMISideEffect)
+ continue;
+
+ DEBUG(dbgs() << "Found Candidate: "; MI->dump(););
+ Candidates.push_back(MI);
+ }
+
+ return Candidates;
+}
+
+void doCandidateWalk(std::vector<TypedVReg> &VRegs,
+ std::queue <TypedVReg> &RegQueue,
+ std::vector<MachineInstr *> &VisitedMIs,
+ const MachineBasicBlock *MBB) {
+
+ const MachineFunction &MF = *MBB->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ while (!RegQueue.empty()) {
+
+ auto TReg = RegQueue.front();
+ RegQueue.pop();
+
+ if (TReg.isFrameIndex()) {
+ DEBUG(dbgs() << "Popping frame index.\n";);
+ VRegs.push_back(TypedVReg(RSE_FrameIndex));
+ continue;
+ }
+
+ assert(TReg.isReg() && "Expected vreg or physreg.");
+ unsigned Reg = TReg.getReg();
+
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ DEBUG({
+ dbgs() << "Popping vreg ";
+ MRI.def_begin(Reg)->dump();
+ dbgs() << "\n";
+ });
+
+ if (!llvm::any_of(VRegs, [&](const TypedVReg &TR) {
+ return TR.isReg() && TR.getReg() == Reg;
+ })) {
+ VRegs.push_back(TypedVReg(Reg));
+ }
+ } else {
+ DEBUG(dbgs() << "Popping physreg.\n";);
+ VRegs.push_back(TypedVReg(Reg));
+ continue;
+ }
+
+ for (auto RI = MRI.def_begin(Reg), RE = MRI.def_end(); RI != RE; ++RI) {
+ MachineInstr *Def = RI->getParent();
+
+ if (Def->getParent() != MBB)
+ continue;
+
+ if (llvm::any_of(VisitedMIs,
+ [&](const MachineInstr *VMI) { return Def == VMI; })) {
+ break;
+ }
+
+ DEBUG({
+ dbgs() << "\n========================\n";
+ dbgs() << "Visited MI: ";
+ Def->dump();
+ dbgs() << "BB Name: " << Def->getParent()->getName() << "\n";
+ dbgs() << "\n========================\n";
+ });
+ VisitedMIs.push_back(Def);
+ for (unsigned I = 1, E = Def->getNumOperands(); I != E; ++I) {
+
+ MachineOperand &MO = Def->getOperand(I);
+ if (MO.isFI()) {
+ DEBUG(dbgs() << "Pushing frame index.\n";);
+ RegQueue.push(TypedVReg(RSE_FrameIndex));
+ }
+
+ if (!MO.isReg())
+ continue;
+ RegQueue.push(TypedVReg(MO.getReg()));
+ }
+ }
+ }
+}
+
+// TODO: Work to remove this in the future. One day when we have named vregs
+// we should be able to form the canonical name based on some characteristic
+// we see at that point of the expression tree (like if we were to name based
+// on some sort of value numbering scheme).
+static void SkipVRegs(unsigned &VRegGapIndex, MachineRegisterInfo &MRI,
+ const TargetRegisterClass *RC) {
+ const unsigned VR_GAP = (++VRegGapIndex * 1000);
+
+ DEBUG({
+ dbgs() << "Adjusting per-BB VR_GAP for BB" << VRegGapIndex << " to "
+ << VR_GAP << "\n";
+ });
+
+ unsigned I = MRI.createVirtualRegister(RC);
+ const unsigned E = (((I + VR_GAP) / VR_GAP) + 1) * VR_GAP;
+ while (I != E) {
+ I = MRI.createVirtualRegister(RC);
+ }
+}
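
The gap arithmetic in SkipVRegs is easier to see with concrete numbers. Assuming a plain counter starting at 37 (a made-up stand-in for the values MRI.createVirtualRegister hands back), E works out to 2000 for the first block, 6000 for the second, and so on: each block skips ahead to a fresh multiple of its gap so that renamed vregs in the two files being diffed are more likely to line up. A standalone sketch of just that counter arithmetic:

#include <cstdio>

int main() {
  unsigned NextVReg = 37; // assumed current vreg counter
  unsigned VRegGapIndex = 0;

  for (int BB = 0; BB < 3; ++BB) {
    const unsigned VR_GAP = ++VRegGapIndex * 1000;
    unsigned I = NextVReg++;                        // MRI.createVirtualRegister(RC)
    const unsigned E = ((I + VR_GAP) / VR_GAP + 1) * VR_GAP;
    while (I != E)
      I = NextVReg++;                               // throw-away vregs
    std::printf("BB%d: renaming starts after vreg %u (gap %u)\n", BB, I, VR_GAP);
  }
  // Prints 2000, 6000 and 12000: each block lands just past a multiple of its
  // per-block gap, which is what keeps the renamed numbering aligned.
}
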
+
+static std::map<unsigned, unsigned>
+GetVRegRenameMap(const std::vector<TypedVReg> &VRegs,
+ const std::vector<unsigned> &renamedInOtherBB,
+ MachineRegisterInfo &MRI,
+ const TargetRegisterClass *RC) {
+ std::map<unsigned, unsigned> VRegRenameMap;
+ unsigned LastRenameReg = MRI.createVirtualRegister(RC);
+ bool FirstCandidate = true;
+
+ for (auto &vreg : VRegs) {
+ if (vreg.isFrameIndex()) {
+ // We skip one vreg for any frame index because there is a good chance
+ // (especially when comparing SelectionDAG to GlobalISel generated MIR)
+ // that in the other file we are just getting an incoming vreg that comes
+ // from a copy from a frame index. So it's safe to skip by one.
+ LastRenameReg = MRI.createVirtualRegister(RC);
+ DEBUG(dbgs() << "Skipping rename for FI " << LastRenameReg << "\n";);
+ continue;
+ } else if (vreg.isCandidate()) {
+
+ // After the first candidate, for every subsequent candidate, we skip mod
+ // 10 registers so that the candidates are more likely to start at the
+ // same vreg number, making it more likely that the canonical walks from the
+ // candidate instructions line up. We don't need to skip from the first
+ // candidate of the BasicBlock because we already skip ahead several vregs
+ // for each BB.
+ while (LastRenameReg % 10) {
+ if (!FirstCandidate) break;
+ LastRenameReg = MRI.createVirtualRegister(RC);
+
+ DEBUG({
+ dbgs() << "Skipping rename for new candidate " << LastRenameReg
+ << "\n";
+ });
+ }
+ FirstCandidate = false;
+ continue;
+ } else if (!TargetRegisterInfo::isVirtualRegister(vreg.getReg())) {
+ LastRenameReg = MRI.createVirtualRegister(RC);
+ DEBUG({
+ dbgs() << "Skipping rename for Phys Reg " << LastRenameReg << "\n";
+ });
+ continue;
+ }
+
+ auto Reg = vreg.getReg();
+ if (llvm::find(renamedInOtherBB, Reg) != renamedInOtherBB.end()) {
+ DEBUG(dbgs() << "Vreg " << Reg << " already renamed in other BB.\n";);
+ continue;
+ }
+
+ auto Rename = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+ LastRenameReg = Rename;
+
+ if (VRegRenameMap.find(Reg) == VRegRenameMap.end()) {
+ DEBUG(dbgs() << "Mapping vreg ";);
+ if (MRI.reg_begin(Reg) != MRI.reg_end()) {
+ DEBUG(auto foo = &*MRI.reg_begin(Reg); foo->dump(););
+ } else {
+ DEBUG(dbgs() << Reg;);
+ }
+ DEBUG(dbgs() << " to ";);
+ if (MRI.reg_begin(Rename) != MRI.reg_end()) {
+ DEBUG(auto foo = &*MRI.reg_begin(Rename); foo->dump(););
+ } else {
+ DEBUG(dbgs() << Rename;);
+ }
+ DEBUG(dbgs() << "\n";);
+
+ VRegRenameMap.insert(std::pair<unsigned, unsigned>(Reg, Rename));
+ }
+ }
+
+ return VRegRenameMap;
+}
+
+static bool doVRegRenaming(std::vector<unsigned> &RenamedInOtherBB,
+ const std::map<unsigned, unsigned> &VRegRenameMap,
+ MachineRegisterInfo &MRI) {
+ bool Changed = false;
+ for (auto I = VRegRenameMap.begin(), E = VRegRenameMap.end(); I != E; ++I) {
+
+ auto VReg = I->first;
+ auto Rename = I->second;
+
+ RenamedInOtherBB.push_back(Rename);
+
+ std::vector<MachineOperand *> RenameMOs;
+ for (auto &MO : MRI.reg_operands(VReg)) {
+ RenameMOs.push_back(&MO);
+ }
+
+ for (auto *MO : RenameMOs) {
+ Changed = true;
+ MO->setReg(Rename);
+
+ if (!MO->isDef())
+ MO->setIsKill(false);
+ }
+ }
+
+ return Changed;
+}
+
+static bool doDefKillClear(MachineBasicBlock *MBB) {
+ bool Changed = false;
+
+ for (auto &MI : *MBB) {
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ if (!MO.isDef() && MO.isKill()) {
+ Changed = true;
+ MO.setIsKill(false);
+ }
+
+ if (MO.isDef() && MO.isDead()) {
+ Changed = true;
+ MO.setIsDead(false);
+ }
+ }
+ }
+
+ return Changed;
+}
+
+static bool runOnBasicBlock(MachineBasicBlock *MBB,
+ std::vector<StringRef> &bbNames,
+ std::vector<unsigned> &renamedInOtherBB,
+ unsigned &basicBlockNum, unsigned &VRegGapIndex) {
+
+ if (CanonicalizeBasicBlockNumber != ~0U) {
+ if (CanonicalizeBasicBlockNumber != basicBlockNum++)
+ return false;
+ DEBUG(dbgs() << "\n Canonicalizing BasicBlock " << MBB->getName() << "\n";);
+ }
+
+ if (llvm::find(bbNames, MBB->getName()) != bbNames.end()) {
+ DEBUG({
+ dbgs() << "Found potentially duplicate BasicBlocks: " << MBB->getName()
+ << "\n";
+ });
+ return false;
+ }
+
+ DEBUG({
+ dbgs() << "\n\n NEW BASIC BLOCK: " << MBB->getName() << " \n\n";
+ dbgs() << "\n\n================================================\n\n";
+ });
+
+ bool Changed = false;
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ const unsigned DummyVReg = GetDummyVReg(MF);
+ const TargetRegisterClass *DummyRC =
+ (DummyVReg == ~0U) ? nullptr : MRI.getRegClass(DummyVReg);
+ if (!DummyRC) return false;
+
+ bbNames.push_back(MBB->getName());
+ DEBUG(dbgs() << "\n\n NEW BASIC BLOCK: " << MBB->getName() << "\n\n";);
+
+ DEBUG(dbgs() << "MBB Before Scheduling:\n"; MBB->dump(););
+ Changed |= rescheduleCanonically(MBB);
+ DEBUG(dbgs() << "MBB After Scheduling:\n"; MBB->dump(););
+
+ std::vector<MachineInstr *> Candidates = populateCandidates(MBB);
+ std::vector<MachineInstr *> VisitedMIs;
+ std::copy(Candidates.begin(), Candidates.end(),
+ std::back_inserter(VisitedMIs));
+
+ std::vector<TypedVReg> VRegs;
+ for (auto candidate : Candidates) {
+ VRegs.push_back(TypedVReg(RSE_NewCandidate));
+
+ std::queue<TypedVReg> RegQueue;
+
+ // Here we walk the vreg operands of a non-root node along our walk. The
+ // root nodes are the original candidates (normally stores). The nodes
+ // handled here are normally not root nodes (except for the case of copies
+ // to physical registers).
+ for (unsigned i = 1; i < candidate->getNumOperands(); i++) {
+ if (candidate->mayStore() || candidate->isBranch())
+ break;
+
+ MachineOperand &MO = candidate->getOperand(i);
+ if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
+ continue;
+
+ DEBUG(dbgs() << "Enqueue register"; MO.dump(); dbgs() << "\n";);
+ RegQueue.push(TypedVReg(MO.getReg()));
+ }
+
+ // Here we walk the root candidates. We start from the 0th operand because
+ // the root is normally a store of a vreg.
+ for (unsigned i = 0; i < candidate->getNumOperands(); i++) {
+
+ if (!candidate->mayStore() && !candidate->isBranch())
+ break;
+
+ MachineOperand &MO = candidate->getOperand(i);
+
+ // TODO: Do we want to only add vregs here?
+ if (!MO.isReg() && !MO.isFI())
+ continue;
+
+ DEBUG(dbgs() << "Enqueue Reg/FI"; MO.dump(); dbgs() << "\n";);
+
+ RegQueue.push(MO.isReg() ? TypedVReg(MO.getReg()) :
+ TypedVReg(RSE_FrameIndex));
+ }
+
+ doCandidateWalk(VRegs, RegQueue, VisitedMIs, MBB);
+ }
+
+ // If we have populated no vregs to rename, then bail.
+ // The rest of this function does the vreg remapping.
+ if (VRegs.size() == 0)
+ return Changed;
+
+ // Skip some vregs, so we can reckon where we'll land next.
+ SkipVRegs(VRegGapIndex, MRI, DummyRC);
+
+ auto VRegRenameMap = GetVRegRenameMap(VRegs, renamedInOtherBB, MRI, DummyRC);
+ Changed |= doVRegRenaming(renamedInOtherBB, VRegRenameMap, MRI);
+ Changed |= doDefKillClear(MBB);
+
+ DEBUG(dbgs() << "Updated MachineBasicBlock:\n"; MBB->dump(); dbgs() << "\n";);
+ DEBUG(dbgs() << "\n\n================================================\n\n");
+ return Changed;
+}
+
+bool MIRCanonicalizer::runOnMachineFunction(MachineFunction &MF) {
+
+ static unsigned functionNum = 0;
+ if (CanonicalizeFunctionNumber != ~0U) {
+ if (CanonicalizeFunctionNumber != functionNum++)
+ return false;
+ DEBUG(dbgs() << "\n Canonicalizing Function " << MF.getName() << "\n";);
+ }
+
+ // We need a valid vreg to create a vreg type for skipping all those
+ // stray vreg numbers so that we reach alignment/canonical vreg values.
+ std::vector<MachineBasicBlock*> RPOList = GetRPOList(MF);
+
+ DEBUG(
+ dbgs() << "\n\n NEW MACHINE FUNCTION: " << MF.getName() << " \n\n";
+ dbgs() << "\n\n================================================\n\n";
+ dbgs() << "Total Basic Blocks: " << RPOList.size() << "\n";
+ for (auto MBB : RPOList) {
+ dbgs() << MBB->getName() << "\n";
+ }
+ dbgs() << "\n\n================================================\n\n";
+ );
+
+ std::vector<StringRef> BBNames;
+ std::vector<unsigned> RenamedInOtherBB;
+
+ unsigned GapIdx = 0;
+ unsigned BBNum = 0;
+
+ bool Changed = false;
+
+ for (auto MBB : RPOList)
+ Changed |= runOnBasicBlock(MBB, BBNames, RenamedInOtherBB, BBNum, GapIdx);
+
+ return Changed;
+}
+
diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp
index 9c8743a7164..fbba60c4312 100644
--- a/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/lib/CodeGen/MIRParser/MIParser.cpp
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#include "MILexer.h"
#include "MIParser.h"
+#include "MILexer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
@@ -21,8 +21,8 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/AsmParser/SlotMapping.h"
@@ -36,6 +36,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -64,7 +65,6 @@
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp
index aae48587c5b..a817c62c985 100644
--- a/lib/CodeGen/MIRPrinter.cpp
+++ b/lib/CodeGen/MIRPrinter.cpp
@@ -12,16 +12,18 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/MIRPrinter.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -31,19 +33,18 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MIRPrinter.h"
-#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSlotTracker.h"
#include "llvm/IR/Value.h"
@@ -57,9 +58,8 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/LowLevelTypeImpl.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/YAMLTraits.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -162,8 +162,8 @@ public:
void printStackObjectReference(int FrameIndex);
void printOffset(int64_t Offset);
void printTargetFlags(const MachineOperand &Op);
- void print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
- unsigned I, bool ShouldPrintRegisterTies,
+ void print(const MachineInstr &MI, unsigned OpIdx,
+ const TargetRegisterInfo *TRI, bool ShouldPrintRegisterTies,
LLT TypeToPrint, bool IsDef = false);
void print(const LLVMContext &Context, const TargetInstrInfo &TII,
const MachineMemOperand &Op);
@@ -734,7 +734,7 @@ void MIPrinter::print(const MachineInstr &MI) {
++I) {
if (I)
OS << ", ";
- print(MI.getOperand(I), TRI, I, ShouldPrintRegisterTies,
+ print(MI, I, TRI, ShouldPrintRegisterTies,
getTypeToPrint(MI, I, PrintedTypes, MRI),
/*IsDef=*/true);
}
@@ -751,7 +751,7 @@ void MIPrinter::print(const MachineInstr &MI) {
for (; I < E; ++I) {
if (NeedComma)
OS << ", ";
- print(MI.getOperand(I), TRI, I, ShouldPrintRegisterTies,
+ print(MI, I, TRI, ShouldPrintRegisterTies,
getTypeToPrint(MI, I, PrintedTypes, MRI));
NeedComma = true;
}
@@ -923,9 +923,11 @@ static const char *getTargetIndexName(const MachineFunction &MF, int Index) {
return nullptr;
}
-void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
- unsigned I, bool ShouldPrintRegisterTies, LLT TypeToPrint,
+void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,
+ const TargetRegisterInfo *TRI,
+ bool ShouldPrintRegisterTies, LLT TypeToPrint,
bool IsDef) {
+ const MachineOperand &Op = MI.getOperand(OpIdx);
printTargetFlags(Op);
switch (Op.getType()) {
case MachineOperand::MO_Register: {
@@ -959,13 +961,16 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
}
}
if (ShouldPrintRegisterTies && Op.isTied() && !Op.isDef())
- OS << "(tied-def " << Op.getParent()->findTiedOperandIdx(I) << ")";
+ OS << "(tied-def " << Op.getParent()->findTiedOperandIdx(OpIdx) << ")";
if (TypeToPrint.isValid())
OS << '(' << TypeToPrint << ')';
break;
}
case MachineOperand::MO_Immediate:
- OS << Op.getImm();
+ if (MI.isOperandSubregIdx(OpIdx))
+ OS << "%subreg." << TRI->getSubRegIndexName(Op.getImm());
+ else
+ OS << Op.getImm();
break;
case MachineOperand::MO_CImmediate:
Op.getCImm()->printAsOperand(OS, /*PrintType=*/true, MST);
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index d5758da0464..40ffbc46556 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -30,7 +31,6 @@
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -42,6 +42,8 @@ using namespace llvm;
MachineBasicBlock::MachineBasicBlock(MachineFunction &MF, const BasicBlock *B)
: BB(B), Number(-1), xParent(&MF) {
Insts.Parent = this;
+ if (B)
+ IrrLoopHeaderWeight = B->getIrrLoopHeaderWeight();
}
MachineBasicBlock::~MachineBasicBlock() {
@@ -338,6 +340,12 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
}
OS << '\n';
}
+ if (IrrLoopHeaderWeight) {
+ if (Indexes) OS << '\t';
+ OS << " Irreducible loop header weight: "
+ << IrrLoopHeaderWeight.getValue();
+ OS << '\n';
+ }
}
void MachineBasicBlock::printAsOperand(raw_ostream &OS,
diff --git a/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index 14cd91206d8..2c336e45056 100644
--- a/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -234,6 +234,12 @@ MachineBlockFrequencyInfo::getProfileCountFromFreq(uint64_t Freq) const {
return MBFI ? MBFI->getProfileCountFromFreq(*F, Freq) : None;
}
+bool
+MachineBlockFrequencyInfo::isIrrLoopHeader(const MachineBasicBlock *MBB) {
+ assert(MBFI && "Expected analysis to be available");
+ return MBFI->isIrrLoopHeader(MBB);
+}
+
const MachineFunction *MachineBlockFrequencyInfo::getFunction() const {
return MBFI ? MBFI->getFunction() : nullptr;
}
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index c5991332f08..9fa4c0d5e4f 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -43,6 +43,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/TailDuplicator.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
@@ -55,7 +56,6 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp
index f0f63715d2f..be197a48d80 100644
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@@ -28,6 +28,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
@@ -35,7 +36,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/RecyclingAllocator.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp
index 9fc990f5c24..5b576b68fcc 100644
--- a/lib/CodeGen/MachineCombiner.cpp
+++ b/lib/CodeGen/MachineCombiner.cpp
@@ -21,11 +21,11 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp
index 61f56fffc88..1a39afe655e 100644
--- a/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/lib/CodeGen/MachineCopyPropagation.cpp
@@ -23,11 +23,11 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cassert>
diff --git a/lib/CodeGen/MachineFrameInfo.cpp b/lib/CodeGen/MachineFrameInfo.cpp
index be8adf75fb7..75c05e2f002 100644
--- a/lib/CodeGen/MachineFrameInfo.cpp
+++ b/lib/CodeGen/MachineFrameInfo.cpp
@@ -16,10 +16,10 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cassert>
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 250a10c7d07..570c410e1fe 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -58,7 +58,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index bb2dda980e4..2c81218f8f6 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -33,6 +33,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
@@ -58,7 +59,6 @@
#include "llvm/Support/LowLevelTypeImpl.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -320,8 +320,45 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
}
case MachineOperand::MO_MCSymbol:
return getMCSymbol() == Other.getMCSymbol();
- case MachineOperand::MO_CFIIndex:
- return getCFIIndex() == Other.getCFIIndex();
+ case MachineOperand::MO_CFIIndex: {
+ const MachineFunction *MF = getParent()->getParent()->getParent();
+ const MachineFunction *OtherMF =
+ Other.getParent()->getParent()->getParent();
+ MCCFIInstruction Inst = MF->getFrameInstructions()[getCFIIndex()];
+ MCCFIInstruction OtherInst =
+ OtherMF->getFrameInstructions()[Other.getCFIIndex()];
+ MCCFIInstruction::OpType op = Inst.getOperation();
+ if (op != OtherInst.getOperation()) return false;
+ switch (op) {
+ case MCCFIInstruction::OpDefCfa:
+ case MCCFIInstruction::OpOffset:
+ case MCCFIInstruction::OpRelOffset:
+ if (Inst.getRegister() != OtherInst.getRegister()) return false;
+ if (Inst.getOffset() != OtherInst.getOffset()) return false;
+ break;
+ case MCCFIInstruction::OpRestore:
+ case MCCFIInstruction::OpUndefined:
+ case MCCFIInstruction::OpSameValue:
+ case MCCFIInstruction::OpDefCfaRegister:
+ if (Inst.getRegister() != OtherInst.getRegister()) return false;
+ break;
+ case MCCFIInstruction::OpRegister:
+ if (Inst.getRegister() != OtherInst.getRegister()) return false;
+ if (Inst.getRegister2() != OtherInst.getRegister2()) return false;
+ break;
+ case MCCFIInstruction::OpDefCfaOffset:
+ case MCCFIInstruction::OpAdjustCfaOffset:
+ case MCCFIInstruction::OpGnuArgsSize:
+ if (Inst.getOffset() != OtherInst.getOffset()) return false;
+ break;
+ case MCCFIInstruction::OpRememberState:
+ case MCCFIInstruction::OpRestoreState:
+ case MCCFIInstruction::OpEscape:
+ case MCCFIInstruction::OpWindowSave:
+ break;
+ }
+ return true;
+ }
case MachineOperand::MO_Metadata:
return getMetadata() == Other.getMetadata();
case MachineOperand::MO_IntrinsicID:
@@ -370,8 +407,13 @@ hash_code llvm::hash_value(const MachineOperand &MO) {
return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMetadata());
case MachineOperand::MO_MCSymbol:
return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMCSymbol());
- case MachineOperand::MO_CFIIndex:
- return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getCFIIndex());
+ case MachineOperand::MO_CFIIndex: {
+ const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+ MCCFIInstruction Inst = MF->getFrameInstructions()[MO.getCFIIndex()];
+ return hash_combine(MO.getType(), MO.getTargetFlags(), Inst.getOperation(),
+ Inst.getRegister(), Inst.getRegister2(),
+ Inst.getOffset());
+ }
case MachineOperand::MO_IntrinsicID:
return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIntrinsicID());
case MachineOperand::MO_Predicate:
diff --git a/lib/CodeGen/MachineInstrBundle.cpp b/lib/CodeGen/MachineInstrBundle.cpp
index b5621a09c6f..eceae9a968b 100644
--- a/lib/CodeGen/MachineInstrBundle.cpp
+++ b/lib/CodeGen/MachineInstrBundle.cpp
@@ -13,7 +13,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index efb5c3371de..d1147fea08e 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -34,6 +34,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -43,7 +44,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
index 1bc869e02e6..48b68e3b718 100644
--- a/lib/CodeGen/MachineOutliner.cpp
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -65,11 +65,11 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp
index c852c2e1564..d270b8e5d8f 100644
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
@@ -89,6 +89,7 @@
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
@@ -102,7 +103,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index be06053f004..1674aba0c82 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -19,6 +19,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
@@ -28,7 +29,6 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cassert>
diff --git a/lib/CodeGen/MachineSSAUpdater.cpp b/lib/CodeGen/MachineSSAUpdater.cpp
index 65d82366767..a3e93de67bb 100644
--- a/lib/CodeGen/MachineSSAUpdater.cpp
+++ b/lib/CodeGen/MachineSSAUpdater.cpp
@@ -21,11 +21,11 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index 3e12bdcd689..900a0a63e96 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -42,6 +42,7 @@
#include "llvm/CodeGen/ScheduleDFS.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/MC/LaneBitmask.h"
@@ -52,7 +53,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index dd6e26d8f8b..f52e3942664 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -33,6 +33,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Pass.h"
@@ -40,7 +41,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index 0bd5c56871c..f894f470445 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -51,6 +51,7 @@
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InlineAsm.h"
@@ -66,7 +67,6 @@
#include "llvm/Support/LowLevelTypeImpl.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/MacroFusion.cpp b/lib/CodeGen/MacroFusion.cpp
index 633a853b2c7..13ddad59382 100644
--- a/lib/CodeGen/MacroFusion.cpp
+++ b/lib/CodeGen/MacroFusion.cpp
@@ -19,10 +19,10 @@
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#define DEBUG_TYPE "machine-scheduler"
diff --git a/lib/CodeGen/OptimizePHIs.cpp b/lib/CodeGen/OptimizePHIs.cpp
index 6430e54a59c..530040a3332 100644
--- a/lib/CodeGen/OptimizePHIs.cpp
+++ b/lib/CodeGen/OptimizePHIs.cpp
@@ -20,8 +20,8 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Pass.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cassert>
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index c7f0329b3c5..af26c170cb8 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -31,11 +31,11 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/PatchableFunction.cpp b/lib/CodeGen/PatchableFunction.cpp
index 513e8271656..84dbebfd2b6 100644
--- a/lib/CodeGen/PatchableFunction.cpp
+++ b/lib/CodeGen/PatchableFunction.cpp
@@ -16,8 +16,8 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index 7cff85a3ab0..c267018c4d1 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -81,6 +81,7 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
@@ -88,7 +89,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/PostRAHazardRecognizer.cpp b/lib/CodeGen/PostRAHazardRecognizer.cpp
index 4a50d895340..9770b336da6 100644
--- a/lib/CodeGen/PostRAHazardRecognizer.cpp
+++ b/lib/CodeGen/PostRAHazardRecognizer.cpp
@@ -31,10 +31,10 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index f2249f9e37e..fd92609613b 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -34,12 +34,12 @@
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp
index 0118580a626..feab36d3995 100644
--- a/lib/CodeGen/ProcessImplicitDefs.cpp
+++ b/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -13,9 +13,9 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index d9e9b3360a0..e41ffc244d9 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -39,6 +39,8 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
@@ -55,8 +57,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetOptions.h"
@@ -76,12 +76,6 @@ using namespace llvm;
using MBBVector = SmallVector<MachineBasicBlock *, 4>;
-static void spillCalleeSavedRegs(MachineFunction &MF, RegScavenger *RS,
- unsigned &MinCSFrameIndex,
- unsigned &MaxCXFrameIndex,
- const MBBVector &SaveBlocks,
- const MBBVector &RestoreBlocks);
-
namespace {
class PEI : public MachineFunctionPass {
@@ -125,6 +119,7 @@ private:
void calculateCallFrameInfo(MachineFunction &Fn);
void calculateSaveRestoreBlocks(MachineFunction &Fn);
+ void spillCalleeSavedRegs(MachineFunction &MF);
void calculateFrameObjectOffsets(MachineFunction &Fn);
void replaceFrameIndices(MachineFunction &Fn);
@@ -197,8 +192,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
// Handle CSR spilling and restoring, for targets that need it.
if (Fn.getTarget().usesPhysRegsForPEI())
- spillCalleeSavedRegs(Fn, RS, MinCSFrameIndex, MaxCSFrameIndex, SaveBlocks,
- RestoreBlocks);
+ spillCalleeSavedRegs(Fn);
// Allow the target machine to make final modifications to the function
// before the frame layout is finalized.
@@ -505,11 +499,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
}
}
-static void spillCalleeSavedRegs(MachineFunction &Fn, RegScavenger *RS,
- unsigned &MinCSFrameIndex,
- unsigned &MaxCSFrameIndex,
- const MBBVector &SaveBlocks,
- const MBBVector &RestoreBlocks) {
+void PEI::spillCalleeSavedRegs(MachineFunction &Fn) {
// We can't list this requirement in getRequiredProperties because some
// targets (WebAssembly) use virtual registers past this point, and the pass
// pipeline is set up without giving the passes a chance to look at the
diff --git a/lib/CodeGen/PseudoSourceValue.cpp b/lib/CodeGen/PseudoSourceValue.cpp
index 5fa5587457d..86fd8745052 100644
--- a/lib/CodeGen/PseudoSourceValue.cpp
+++ b/lib/CodeGen/PseudoSourceValue.cpp
@@ -14,7 +14,7 @@
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/ErrorHandling.h"
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 7061c3ff652..19467ae3b72 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Metadata.h"
@@ -41,7 +42,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index e74ac79f001..23e5f907c88 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -54,6 +54,7 @@
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
@@ -66,7 +67,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/RegUsageInfoCollector.cpp b/lib/CodeGen/RegUsageInfoCollector.cpp
index 214c6d2c820..3aaa5a4738d 100644
--- a/lib/CodeGen/RegUsageInfoCollector.cpp
+++ b/lib/CodeGen/RegUsageInfoCollector.cpp
@@ -27,7 +27,7 @@
#include "llvm/CodeGen/RegisterUsageInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
using namespace llvm;
diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp
index 956dec39fc3..8e463ff272d 100644
--- a/lib/CodeGen/RegisterClassInfo.cpp
+++ b/lib/CodeGen/RegisterClassInfo.cpp
@@ -24,7 +24,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index 1ef7e41b8ae..84c2e2548ec 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -35,17 +35,17 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"
-#include "llvm/Pass.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index 844ddb9ed3f..18fe2d85954 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -28,13 +28,13 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
diff --git a/lib/CodeGen/RenameIndependentSubregs.cpp b/lib/CodeGen/RenameIndependentSubregs.cpp
index bd5ecbd28f2..19e0f30ecfc 100644
--- a/lib/CodeGen/RenameIndependentSubregs.cpp
+++ b/lib/CodeGen/RenameIndependentSubregs.cpp
@@ -35,7 +35,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 5e95f760aaa..627bc1946f3 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -19,11 +19,11 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index e2cb8cad6e1..b789e2d9c52 100644
--- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -15,12 +15,12 @@
#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include <cassert>
using namespace llvm;
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 491c56a7314..6e245feb735 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -63,6 +63,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
@@ -98,7 +99,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index b736037d71d..696855f8018 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -19,6 +19,8 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
@@ -32,8 +34,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 7a123e3e92e..2760c6cbf86 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -21,12 +21,12 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index ff49134f7b9..356f2585046 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -40,7 +40,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index b42edf8e751..0d85bccdeac 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -849,13 +849,14 @@ static void transferDbgValues(SelectionDAG &DAG, SDValue From, SDValue To,
break;
DIVariable *Var = Dbg->getVariable();
- auto *Fragment = DIExpression::createFragmentExpression(
- Dbg->getExpression(), OffsetInBits, To.getValueSizeInBits());
- SDDbgValue *Clone =
- DAG.getDbgValue(Var, Fragment, ToNode, To.getResNo(), Dbg->isIndirect(),
- Dbg->getDebugLoc(), Dbg->getOrder());
+ if (auto Fragment = DIExpression::createFragmentExpression(
+ Dbg->getExpression(), OffsetInBits, To.getValueSizeInBits())) {
+ SDDbgValue *Clone = DAG.getDbgValue(Var, *Fragment, ToNode, To.getResNo(),
+ Dbg->isIndirect(), Dbg->getDebugLoc(),
+ Dbg->getOrder());
+ ClonedDVs.push_back(Clone);
+ }
Dbg->setIsInvalidated();
- ClonedDVs.push_back(Clone);
}
for (SDDbgValue *Dbg : ClonedDVs)
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5d6c4998ecd..b55414b51b8 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3844,7 +3844,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
}
LdOps.push_back(L);
-
+ LdOp = L;
LdWidth -= NewVTWidth;
}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index 13799409327..87dcf6e6950 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -18,12 +18,12 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 98202925629..24d9d376de2 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -32,6 +32,7 @@
#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -42,7 +43,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 7ddb0dc07fd..2e1abbe8bb2 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -23,11 +23,11 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
index 631cb34717c..c25315f2983 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
@@ -25,11 +25,11 @@
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <climits>
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index e5de280508b..2fb2615b072 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2893,11 +2893,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
}
case ISD::FrameIndex:
case ISD::TargetFrameIndex:
- if (unsigned Align = InferPtrAlignment(Op)) {
- // The low bits are known zero if the pointer is aligned.
- Known.Zero.setLowBits(Log2_32(Align));
- break;
- }
+ TLI->computeKnownBitsForFrameIndex(Op, Known, DemandedElts, *this, Depth);
break;
default:
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ccc06fa3ee1..f4f8879b5d8 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -55,6 +55,8 @@
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/Argument.h"
@@ -98,8 +100,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
@@ -2585,7 +2585,7 @@ static bool isVectorReductionOp(const User *I) {
case Instruction::FAdd:
case Instruction::FMul:
if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
- if (FPOp->getFastMathFlags().unsafeAlgebra())
+ if (FPOp->getFastMathFlags().isFast())
break;
LLVM_FALLTHROUGH;
default:
@@ -2631,7 +2631,7 @@ static bool isVectorReductionOp(const User *I) {
if (Inst->getOpcode() == OpCode || isa<PHINode>(U)) {
if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
- if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().unsafeAlgebra())
+ if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().isFast())
return false;
UsersToVisit.push_back(U);
} else if (const ShuffleVectorInst *ShufInst =
@@ -2725,7 +2725,7 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
Flags.setNoInfs(FMF.noInfs());
Flags.setNoNaNs(FMF.noNaNs());
Flags.setNoSignedZeros(FMF.noSignedZeros());
- Flags.setUnsafeAlgebra(FMF.unsafeAlgebra());
+ Flags.setUnsafeAlgebra(FMF.isFast());
SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(),
Op1, Op2, Flags);
@@ -3862,7 +3862,7 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
//
// When the first GEP operand is a single pointer - it is the uniform base we
// are looking for. If first operand of the GEP is a splat vector - we
-// extract the spalt value and use it as a uniform base.
+// extract the splat value and use it as a uniform base.
// In all other cases the function returns 'false'.
static bool getUniformBase(const Value* &Ptr, SDValue& Base, SDValue& Index,
SelectionDAGBuilder* SDB) {
@@ -4828,12 +4828,6 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
MachineFunction &MF = DAG.getMachineFunction();
const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();
- // Ignore inlined function arguments here.
- //
- // FIXME: Should we be checking DL->inlinedAt() to determine this?
- if (!Variable->getScope()->getSubprogram()->describes(MF.getFunction()))
- return false;
-
bool IsIndirect = false;
Optional<MachineOperand> Op;
// Some arguments' frame index is recorded during argument lowering.
@@ -4873,11 +4867,13 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
for (unsigned E = I + RegCount; I != E; ++I) {
// The vregs are guaranteed to be allocated in sequence.
Op = MachineOperand::CreateReg(VMI->second + I, false);
- auto *FragmentExpr = DIExpression::createFragmentExpression(
+ auto FragmentExpr = DIExpression::createFragmentExpression(
Expr, Offset, RegisterSize);
+ if (!FragmentExpr)
+ continue;
FuncInfo.ArgDbgValues.push_back(
BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsDbgDeclare,
- Op->getReg(), Variable, FragmentExpr));
+ Op->getReg(), Variable, *FragmentExpr));
Offset += RegisterSize;
}
}
@@ -7959,13 +7955,13 @@ void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
switch (Intrinsic) {
case Intrinsic::experimental_vector_reduce_fadd:
- if (FMF.unsafeAlgebra())
+ if (FMF.isFast())
Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2);
else
Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2);
break;
case Intrinsic::experimental_vector_reduce_fmul:
- if (FMF.unsafeAlgebra())
+ if (FMF.isFast())
Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2);
else
Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 1550347f006..1097fd92ede 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -38,7 +39,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Printable.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 4c4d196427e..ae5eebd9545 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/SelectionDAGISel.h"
#include "ScheduleDAGSDNodes.h"
#include "SelectionDAGBuilder.h"
#include "llvm/ADT/APInt.h"
@@ -45,9 +46,9 @@
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -80,7 +81,6 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index fe553bc986a..1f6fafb039e 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1288,6 +1288,19 @@ void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known.resetAll();
}
+void TargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ assert(isa<FrameIndexSDNode>(Op) && "expected FrameIndex");
+
+ if (unsigned Align = DAG.InferPtrAlignment(Op)) {
+ // The low bits are known zero if the pointer is aligned.
+ Known.Zero.setLowBits(Log2_32(Align));
+ }
+}
+
/// This method can be implemented by targets that want to expose additional
/// information about sign bits to the DAG Combiner.
unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
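A minimal sketch of the computeKnownBitsForFrameIndex hook added above. The default implementation keeps the old behaviour of inferring pointer alignment; a target can override it to publish more known-zero bits. MyTargetLowering and the 16-byte alignment guarantee are illustrative assumptions, not part of this patch.

  void MyTargetLowering::computeKnownBitsForFrameIndex(
      const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
      const SelectionDAG &DAG, unsigned Depth) const {
    // Start from the generic alignment-based bits computed by the base class.
    TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
                                                  DAG, Depth);
    // Hypothetical target guarantee: every frame object is 16-byte aligned.
    Known.Zero.setLowBits(4);
  }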
diff --git a/lib/CodeGen/ShrinkWrap.cpp b/lib/CodeGen/ShrinkWrap.cpp
index 5fb6afee88a..3230aff5ed8 100644
--- a/lib/CodeGen/ShrinkWrap.cpp
+++ b/lib/CodeGen/ShrinkWrap.cpp
@@ -56,15 +56,17 @@
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -73,8 +75,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp
index 3656832a7f1..25a1c37b145 100644
--- a/lib/CodeGen/SlotIndexes.cpp
+++ b/lib/CodeGen/SlotIndexes.cpp
@@ -10,9 +10,9 @@
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index 1467179b7a3..7a9a65faaca 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -34,6 +34,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/LaneBitmask.h"
@@ -43,7 +44,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index 8a47f3d2d6d..69614a55e14 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -28,12 +28,12 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
diff --git a/lib/CodeGen/TailDuplicator.cpp b/lib/CodeGen/TailDuplicator.cpp
index bd3a20f936d..5b76e36382c 100644
--- a/lib/CodeGen/TailDuplicator.cpp
+++ b/lib/CodeGen/TailDuplicator.cpp
@@ -12,13 +12,14 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/TailDuplicator.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -27,14 +28,13 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
-#include "llvm/CodeGen/TailDuplicator.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
@@ -603,8 +603,8 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
if (PreRegAlloc && MI.isCall())
return false;
- if (!MI.isPHI() && !MI.isDebugValue())
- InstrCount += 1;
+ if (!MI.isPHI() && !MI.isMetaInstruction())
+ InstrCount += 1;
if (InstrCount > MaxDuplicateCount)
return false;
diff --git a/lib/CodeGen/TargetFrameLoweringImpl.cpp b/lib/CodeGen/TargetFrameLoweringImpl.cpp
index 9dd98b4020d..4d2130f5ebe 100644
--- a/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -20,7 +20,7 @@
#include "llvm/IR/Function.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Compiler.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -104,3 +104,12 @@ unsigned TargetFrameLowering::getStackAlignmentSkew(
return 0;
}
+
+int TargetFrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
+ llvm_unreachable("getInitialCFAOffset() not implemented!");
+}
+
+unsigned TargetFrameLowering::getInitialCFARegister(const MachineFunction &MF)
+ const {
+ llvm_unreachable("getInitialCFARegister() not implemented!");
+}
\ No newline at end of file
diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp
index bac12efd639..702b91b7f77 100644
--- a/lib/CodeGen/TargetInstrInfo.cpp
+++ b/lib/CodeGen/TargetInstrInfo.cpp
@@ -11,7 +11,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
@@ -19,6 +19,7 @@
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -26,7 +27,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/TargetOptionsImpl.cpp b/lib/CodeGen/TargetOptionsImpl.cpp
index ed845e1706f..99ff4931e2f 100644
--- a/lib/CodeGen/TargetOptionsImpl.cpp
+++ b/lib/CodeGen/TargetOptionsImpl.cpp
@@ -15,7 +15,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index c5101b1ecfc..59e88ba3bda 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -600,8 +600,14 @@ void TargetPassConfig::addIRPasses() {
addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
}
- if (getOptLevel() != CodeGenOpt::None && EnableMergeICmps) {
- addPass(createMergeICmpsPass());
+ if (getOptLevel() != CodeGenOpt::None) {
+ // The MergeICmpsPass tries to create memcmp calls by grouping sequences of
+ // loads and compares. ExpandMemCmpPass then tries to expand those calls
+ // into optimally-sized loads and compares. The transforms are enabled by a
+ // target lowering hook.
+ if (EnableMergeICmps)
+ addPass(createMergeICmpsPass());
+ addPass(createExpandMemCmpPass());
}
// Run GC lowering passes for builtin collectors
diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp
index 55318237e95..758fdabf5dd 100644
--- a/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/lib/CodeGen/TargetRegisterInfo.cpp
@@ -27,7 +27,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Printable.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cassert>
diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp
index e1db9157f90..659d6f522af 100644
--- a/lib/CodeGen/TargetSchedule.cpp
+++ b/lib/CodeGen/TargetSchedule.cpp
@@ -16,13 +16,13 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
diff --git a/lib/CodeGen/TargetSubtargetInfo.cpp b/lib/CodeGen/TargetSubtargetInfo.cpp
index 29cfd9fb178..d66272aed9b 100644
--- a/lib/CodeGen/TargetSubtargetInfo.cpp
+++ b/lib/CodeGen/TargetSubtargetInfo.cpp
@@ -14,11 +14,11 @@
#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include <string>
using namespace llvm;
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index efd40b209e9..18f1485baca 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -46,6 +46,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Pass.h"
@@ -54,7 +55,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp
index bdd25f29aea..5288ca67277 100644
--- a/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/lib/CodeGen/UnreachableBlockElim.cpp
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Dominators.h"
@@ -37,7 +38,6 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
static bool eliminateUnreachableBlock(Function &F) {
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 65c62b16719..d499c6c1e73 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -31,12 +31,12 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Pass.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/XRayInstrumentation.cpp b/lib/CodeGen/XRayInstrumentation.cpp
index 2063ab11a74..fb621cab28b 100644
--- a/lib/CodeGen/XRayInstrumentation.cpp
+++ b/lib/CodeGen/XRayInstrumentation.cpp
@@ -23,10 +23,10 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
index a88dcfcf542..f593953c62f 100644
--- a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
+++ b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
@@ -184,7 +184,7 @@ Optional<DWARFFormValue> DWARFAbbreviationDeclaration::getAttributeValue(
FormValue.setSValue(Spec.getImplicitConstValue());
return FormValue;
}
- if (FormValue.extractValue(DebugInfoData, &Offset, &U))
+ if (FormValue.extractValue(DebugInfoData, &Offset, U.getFormParams(), &U))
return FormValue;
}
// March Offset along until we get to the attribute we want.
diff --git a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
index dbe6fe52407..f04ec7706cd 100644
--- a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
+++ b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
@@ -90,10 +90,11 @@ std::pair<uint32_t, dwarf::Tag>
DWARFAcceleratorTable::readAtoms(uint32_t &HashDataOffset) {
uint32_t DieOffset = dwarf::DW_INVALID_OFFSET;
dwarf::Tag DieTag = dwarf::DW_TAG_null;
+ DWARFFormParams FormParams = {Hdr.Version, 0, dwarf::DwarfFormat::DWARF32};
for (auto Atom : getAtomsDesc()) {
DWARFFormValue FormValue(Atom.second);
- FormValue.extractValue(AccelSection, &HashDataOffset, NULL);
+ FormValue.extractValue(AccelSection, &HashDataOffset, FormParams);
switch (Atom.first) {
case dwarf::DW_ATOM_die_offset:
DieOffset = *FormValue.getAsUnsignedConstant();
@@ -145,6 +146,7 @@ LLVM_DUMP_METHOD void DWARFAcceleratorTable::dump(raw_ostream &OS) const {
uint32_t Offset = sizeof(Hdr) + Hdr.HeaderDataLength;
unsigned HashesBase = Offset + Hdr.NumBuckets * 4;
unsigned OffsetsBase = HashesBase + Hdr.NumHashes * 4;
+ DWARFFormParams FormParams = {Hdr.Version, 0, dwarf::DwarfFormat::DWARF32};
for (unsigned Bucket = 0; Bucket < Hdr.NumBuckets; ++Bucket) {
unsigned Index = AccelSection.getU32(&Offset);
@@ -181,7 +183,7 @@ LLVM_DUMP_METHOD void DWARFAcceleratorTable::dump(raw_ostream &OS) const {
unsigned i = 0;
for (auto &Atom : AtomForms) {
OS << format("{Atom[%d]: ", i++);
- if (Atom.extractValue(AccelSection, &DataOffset, nullptr))
+ if (Atom.extractValue(AccelSection, &DataOffset, FormParams))
Atom.dump(OS);
else
OS << "Error extracting the value";
@@ -216,8 +218,10 @@ void DWARFAcceleratorTable::ValueIterator::Next() {
NumData = 0;
return;
}
+ DWARFFormParams FormParams = {AccelTable->Hdr.Version, 0,
+ dwarf::DwarfFormat::DWARF32};
for (auto &Atom : AtomForms)
- Atom.extractValue(AccelSection, &DataOffset, nullptr);
+ Atom.extractValue(AccelSection, &DataOffset, FormParams);
++Data;
}
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 24aa666fb81..881cd1dfd11 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -329,9 +329,9 @@ void DWARFContext::dump(
// representation.
OS << "debug_line[" << format("0x%8.8x", Offset) << "]\n";
if (DumpOpts.Verbose) {
- LineTable.parse(lineData, &Offset, &OS);
+ LineTable.parse(lineData, &Offset, &*CU, &OS);
} else {
- LineTable.parse(lineData, &Offset);
+ LineTable.parse(lineData, &Offset, &*CU);
LineTable.dump(OS);
}
}
@@ -349,7 +349,7 @@ void DWARFContext::dump(
DWARFDataExtractor lineData(*DObj, DObj->getLineDWOSection(),
isLittleEndian(), savedAddressByteSize);
DWARFDebugLine::LineTable LineTable;
- while (LineTable.Prologue.parse(lineData, &stmtOffset)) {
+ while (LineTable.Prologue.parse(lineData, &stmtOffset, nullptr)) {
LineTable.dump(OS);
LineTable.clear();
}
@@ -681,7 +681,7 @@ DWARFContext::getLineTableForUnit(DWARFUnit *U) {
// We have to parse it first.
DWARFDataExtractor lineData(*DObj, U->getLineSection(), isLittleEndian(),
U->getAddressByteSize());
- return Line->getOrParseLineTable(lineData, stmtOffset);
+ return Line->getOrParseLineTable(lineData, stmtOffset, U);
}
void DWARFContext::parseCompileUnits() {
diff --git a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
index bd8dd0d0ede..c99c7a9277e 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
@@ -144,7 +144,7 @@ parseV5EntryFormat(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr,
static bool
parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
uint32_t *OffsetPtr, uint64_t EndPrologueOffset,
- const DWARFFormParams &FormParams,
+ const DWARFFormParams &FormParams, const DWARFUnit *U,
std::vector<StringRef> &IncludeDirectories,
std::vector<DWARFDebugLine::FileNameEntry> &FileNames) {
// Get the directory entry description.
@@ -162,7 +162,7 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
DWARFFormValue Value(Descriptor.Form);
switch (Descriptor.Type) {
case DW_LNCT_path:
- if (!Value.extractValue(DebugLineData, OffsetPtr, nullptr))
+ if (!Value.extractValue(DebugLineData, OffsetPtr, FormParams, U))
return false;
IncludeDirectories.push_back(Value.getAsCString().getValue());
break;
@@ -187,7 +187,7 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
DWARFDebugLine::FileNameEntry FileEntry;
for (auto Descriptor : FileDescriptors) {
DWARFFormValue Value(Descriptor.Form);
- if (!Value.extractValue(DebugLineData, OffsetPtr, nullptr))
+ if (!Value.extractValue(DebugLineData, OffsetPtr, FormParams, U))
return false;
switch (Descriptor.Type) {
case DW_LNCT_path:
@@ -213,7 +213,7 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
}
bool DWARFDebugLine::Prologue::parse(const DWARFDataExtractor &DebugLineData,
- uint32_t *OffsetPtr) {
+ uint32_t *OffsetPtr, const DWARFUnit *U) {
const uint64_t PrologueOffset = *OffsetPtr;
clear();
@@ -253,7 +253,8 @@ bool DWARFDebugLine::Prologue::parse(const DWARFDataExtractor &DebugLineData,
if (getVersion() >= 5) {
if (!parseV5DirFileTables(DebugLineData, OffsetPtr, EndPrologueOffset,
- getFormParams(), IncludeDirectories, FileNames)) {
+ getFormParams(), U, IncludeDirectories,
+ FileNames)) {
fprintf(stderr,
"warning: parsing line table prologue at 0x%8.8" PRIx64
" found an invalid directory or file table description at"
@@ -382,24 +383,25 @@ DWARFDebugLine::getLineTable(uint32_t Offset) const {
const DWARFDebugLine::LineTable *
DWARFDebugLine::getOrParseLineTable(const DWARFDataExtractor &DebugLineData,
- uint32_t Offset) {
+ uint32_t Offset, const DWARFUnit *U) {
std::pair<LineTableIter, bool> Pos =
LineTableMap.insert(LineTableMapTy::value_type(Offset, LineTable()));
LineTable *LT = &Pos.first->second;
if (Pos.second) {
- if (!LT->parse(DebugLineData, &Offset))
+ if (!LT->parse(DebugLineData, &Offset, U))
return nullptr;
}
return LT;
}
bool DWARFDebugLine::LineTable::parse(const DWARFDataExtractor &DebugLineData,
- uint32_t *OffsetPtr, raw_ostream *OS) {
+ uint32_t *OffsetPtr, const DWARFUnit *U,
+ raw_ostream *OS) {
const uint32_t DebugLineOffset = *OffsetPtr;
clear();
- if (!Prologue.parse(DebugLineData, OffsetPtr)) {
+ if (!Prologue.parse(DebugLineData, OffsetPtr, U)) {
// Restore our offset and return false to indicate failure!
*OffsetPtr = DebugLineOffset;
return false;
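A usage sketch of the threaded-through unit parameter; LineData, StmtOffset and U stand for the caller's extractor, offset and unit, as in the DWARFContext hunk above.

  // Line tables are now parsed with the owning unit available so that v5
  // directory/file entries whose forms may need unit context can be decoded;
  // passing nullptr is still allowed, as the verbose dump path above does.
  const DWARFDebugLine::LineTable *LT =
      Line->getOrParseLineTable(LineData, StmtOffset, U);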
diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp
index d20eabff7f0..a579c06d02e 100644
--- a/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -208,7 +208,8 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
DWARFUnit *U = Die.getDwarfUnit();
DWARFFormValue formValue(Form);
- if (!formValue.extractValue(U->getDebugInfoExtractor(), OffsetPtr, U))
+ if (!formValue.extractValue(U->getDebugInfoExtractor(), OffsetPtr,
+ U->getFormParams(), U))
return;
OS << "\t(";
@@ -550,7 +551,7 @@ void DWARFDie::attribute_iterator::updateForIndex(
auto U = Die.getDwarfUnit();
assert(U && "Die must have valid DWARF unit");
bool b = AttrValue.Value.extractValue(U->getDebugInfoExtractor(),
- &ParseOffset, U);
+ &ParseOffset, U->getFormParams(), U);
(void)b;
assert(b && "extractValue cannot fail on fully parsed DWARF");
AttrValue.ByteSize = ParseOffset - AttrValue.Offset;
diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp
index d63e84ef4e7..c4abd49797b 100644
--- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp
+++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp
@@ -276,7 +276,8 @@ bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const {
}
bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
- uint32_t *OffsetPtr, const DWARFUnit *CU) {
+ uint32_t *OffsetPtr, DWARFFormParams FP,
+ const DWARFUnit *CU) {
U = CU;
bool Indirect = false;
bool IsBlock = false;
@@ -288,10 +289,8 @@ bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
switch (Form) {
case DW_FORM_addr:
case DW_FORM_ref_addr: {
- if (!U)
- return false;
- uint16_t Size = (Form == DW_FORM_addr) ? U->getAddressByteSize()
- : U->getRefAddrByteSize();
+ uint16_t Size =
+ (Form == DW_FORM_addr) ? FP.AddrSize : FP.getRefAddrByteSize();
Value.uval = Data.getRelocatedValue(Size, OffsetPtr, &Value.SectionIndex);
break;
}
@@ -360,10 +359,8 @@ bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
case DW_FORM_GNU_strp_alt:
case DW_FORM_line_strp:
case DW_FORM_strp_sup: {
- if (!U)
- return false;
Value.uval =
- Data.getRelocatedValue(U->getDwarfOffsetByteSize(), OffsetPtr);
+ Data.getRelocatedValue(FP.getDwarfOffsetByteSize(), OffsetPtr);
break;
}
case DW_FORM_flag_present:
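A hedged sketch of the new extractValue interface: form values can now be decoded from explicit DWARFFormParams instead of requiring a DWARFUnit. Data and Offset are assumed to be a caller-supplied DWARFDataExtractor and uint32_t offset; the parameters describe a version 4, DWARF32 context with 8-byte addresses.

  // No DWARFUnit is needed for forms like DW_FORM_ref_addr any more,
  // since the reference and offset sizes now come from FP.
  DWARFFormParams FP = {4, 8, dwarf::DwarfFormat::DWARF32};
  DWARFFormValue V(dwarf::DW_FORM_ref_addr);
  bool OK = V.extractValue(Data, &Offset, FP, /*CU=*/nullptr);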
diff --git a/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp b/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
index dd8e2ac8b53..dee27c621fa 100644
--- a/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
@@ -176,9 +176,8 @@ Error PDBFileBuilder::commit(StringRef Filename) {
uint64_t Filesize = Layout.SB->BlockSize * Layout.SB->NumBlocks;
auto OutFileOrError = FileOutputBuffer::create(Filename, Filesize);
- if (OutFileOrError.getError())
- return llvm::make_error<pdb::GenericError>(generic_error_code::invalid_path,
- Filename);
+ if (auto E = OutFileOrError.takeError())
+ return E;
FileBufferByteStream Buffer(std::move(*OutFileOrError),
llvm::support::little);
BinaryStreamWriter Writer(Buffer);
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index 3f41a1dc066..0fafe82404e 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -1108,10 +1108,12 @@ static void writeAtomicRMWOperation(raw_ostream &Out,
static void WriteOptimizationInfo(raw_ostream &Out, const User *U) {
if (const FPMathOperator *FPO = dyn_cast<const FPMathOperator>(U)) {
- // Unsafe algebra implies all the others, no need to write them all out
- if (FPO->hasUnsafeAlgebra())
+ // 'Fast' is an abbreviation for all fast-math-flags.
+ if (FPO->isFast())
Out << " fast";
else {
+ if (FPO->hasAllowReassoc())
+ Out << " reassoc";
if (FPO->hasNoNaNs())
Out << " nnan";
if (FPO->hasNoInfs())
@@ -1122,6 +1124,8 @@ static void WriteOptimizationInfo(raw_ostream &Out, const User *U) {
Out << " arcp";
if (FPO->hasAllowContract())
Out << " contract";
+ if (FPO->hasApproxFunc())
+ Out << " afn";
}
}
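The printing above pairs with the renamed Instruction accessors in the Instruction.cpp hunk further down. A small sketch, assuming I points at a floating-point instruction such as an fadd:

  I->setHasAllowReassoc(true);  // printed as "reassoc"
  I->setHasApproxFunc(true);    // printed as "afn"
  // I->isFast() is still false here: "fast" is only printed (and reported)
  // when every fast-math flag is set.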
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 07d499bc193..2c9e9be3da5 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -78,6 +78,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name=="ssse3.pabs.d.128" || // Added in 6.0
Name.startswith("avx2.pabs.") || // Added in 6.0
Name.startswith("avx512.mask.pabs.") || // Added in 6.0
+ Name.startswith("avx512.broadcastm") || // Added in 6.0
Name.startswith("avx512.mask.pbroadcast") || // Added in 6.0
Name.startswith("sse2.pcmpeq.") || // Added in 3.1
Name.startswith("sse2.pcmpgt.") || // Added in 3.1
@@ -1027,7 +1028,15 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateICmp(CmpEq ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_SGT,
CI->getArgOperand(0), CI->getArgOperand(1));
Rep = Builder.CreateSExt(Rep, CI->getType(), "");
- } else if (IsX86 && (Name.startswith("avx512.mask.pbroadcast"))){
+ } else if (IsX86 && (Name.startswith("avx512.broadcastm"))) {
+ Type *ExtTy = Type::getInt32Ty(C);
+ if (CI->getOperand(0)->getType()->isIntegerTy(8))
+ ExtTy = Type::getInt64Ty(C);
+ unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() /
+ ExtTy->getPrimitiveSizeInBits();
+ Rep = Builder.CreateZExt(CI->getArgOperand(0), ExtTy);
+ Rep = Builder.CreateVectorSplat(NumElts, Rep);
+ } else if (IsX86 && (Name.startswith("avx512.mask.pbroadcast"))) {
unsigned NumElts =
CI->getArgOperand(1)->getType()->getVectorNumElements();
Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0));
diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp
index 2b780adf6c6..22513924a96 100644
--- a/lib/IR/BasicBlock.cpp
+++ b/lib/IR/BasicBlock.cpp
@@ -447,3 +447,16 @@ bool BasicBlock::isLandingPad() const {
const LandingPadInst *BasicBlock::getLandingPadInst() const {
return dyn_cast<LandingPadInst>(getFirstNonPHI());
}
+
+Optional<uint64_t> BasicBlock::getIrrLoopHeaderWeight() const {
+ const TerminatorInst *TI = getTerminator();
+ if (MDNode *MDIrrLoopHeader =
+ TI->getMetadata(LLVMContext::MD_irr_loop)) {
+ MDString *MDName = cast<MDString>(MDIrrLoopHeader->getOperand(0));
+ if (MDName->getString().equals("loop_header_weight")) {
+ auto *CI = mdconst::extract<ConstantInt>(MDIrrLoopHeader->getOperand(1));
+ return Optional<uint64_t>(CI->getValue().getZExtValue());
+ }
+ }
+ return Optional<uint64_t>();
+}
diff --git a/lib/IR/CMakeLists.txt b/lib/IR/CMakeLists.txt
index eb4b9143090..17822bbbb5c 100644
--- a/lib/IR/CMakeLists.txt
+++ b/lib/IR/CMakeLists.txt
@@ -22,7 +22,6 @@ add_llvm_library(LLVMCore
DiagnosticPrinter.cpp
Dominators.cpp
Function.cpp
- GCOV.cpp
GVMaterializer.cpp
Globals.cpp
IRBuilder.cpp
diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp
index b34383f5ada..df0c52d4463 100644
--- a/lib/IR/DebugInfo.cpp
+++ b/lib/IR/DebugInfo.cpp
@@ -676,25 +676,7 @@ unsigned llvm::getDebugMetadataVersionFromModule(const Module &M) {
void Instruction::applyMergedLocation(const DILocation *LocA,
const DILocation *LocB) {
- if (LocA && LocB && (LocA == LocB || !LocA->canDiscriminate(*LocB))) {
- setDebugLoc(LocA);
- return;
- }
- if (!LocA || !LocB || !isa<CallInst>(this)) {
- setDebugLoc(nullptr);
- return;
- }
- SmallPtrSet<DILocation *, 5> InlinedLocationsA;
- for (DILocation *L = LocA->getInlinedAt(); L; L = L->getInlinedAt())
- InlinedLocationsA.insert(L);
- const DILocation *Result = LocB;
- for (DILocation *L = LocB->getInlinedAt(); L; L = L->getInlinedAt()) {
- Result = L;
- if (InlinedLocationsA.count(L))
- break;
- }
- setDebugLoc(DILocation::get(
- Result->getContext(), 0, 0, Result->getScope(), Result->getInlinedAt()));
+ setDebugLoc(DILocation::getMergedLocation(LocA, LocB, this));
}
//===----------------------------------------------------------------------===//
diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp
index 645bba1652d..ae02392ea14 100644
--- a/lib/IR/DebugInfoMetadata.cpp
+++ b/lib/IR/DebugInfoMetadata.cpp
@@ -14,9 +14,11 @@
#include "llvm/IR/DebugInfoMetadata.h"
#include "LLVMContextImpl.h"
#include "MetadataImpl.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
using namespace llvm;
@@ -66,6 +68,31 @@ DILocation *DILocation::getImpl(LLVMContext &Context, unsigned Line,
Storage, Context.pImpl->DILocations);
}
+const DILocation *
+DILocation::getMergedLocation(const DILocation *LocA, const DILocation *LocB,
+ const Instruction *ForInst) {
+ if (!LocA || !LocB)
+ return nullptr;
+
+ if (LocA == LocB || !LocA->canDiscriminate(*LocB))
+ return LocA;
+
+ if (!dyn_cast_or_null<CallInst>(ForInst))
+ return nullptr;
+
+ SmallPtrSet<DILocation *, 5> InlinedLocationsA;
+ for (DILocation *L = LocA->getInlinedAt(); L; L = L->getInlinedAt())
+ InlinedLocationsA.insert(L);
+ const DILocation *Result = LocB;
+ for (DILocation *L = LocB->getInlinedAt(); L; L = L->getInlinedAt()) {
+ Result = L;
+ if (InlinedLocationsA.count(L))
+ break;
+ }
+ return DILocation::get(Result->getContext(), 0, 0, Result->getScope(),
+ Result->getInlinedAt());
+}
+
DINode::DIFlags DINode::getFlag(StringRef Flag) {
return StringSwitch<DIFlags>(Flag)
#define HANDLE_DI_FLAG(ID, NAME) .Case("DIFlag" #NAME, Flag##NAME)
@@ -726,14 +753,23 @@ DIExpression *DIExpression::prepend(const DIExpression *Expr, bool Deref,
return DIExpression::get(Expr->getContext(), Ops);
}
-DIExpression *DIExpression::createFragmentExpression(const DIExpression *Expr,
- unsigned OffsetInBits,
- unsigned SizeInBits) {
+Optional<DIExpression *> DIExpression::createFragmentExpression(
+ const DIExpression *Expr, unsigned OffsetInBits, unsigned SizeInBits) {
SmallVector<uint64_t, 8> Ops;
// Copy over the expression, but leave off any trailing DW_OP_LLVM_fragment.
if (Expr) {
for (auto Op : Expr->expr_ops()) {
- if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
+ switch (Op.getOp()) {
+ default: break;
+ case dwarf::DW_OP_plus:
+ case dwarf::DW_OP_minus:
+ // We can't safely split arithmetic into multiple fragments because we
+ // can't express carry-over between fragments.
+ //
+ // FIXME: We *could* preserve the lowest fragment of a constant offset
+ // operation if the offset fits into SizeInBits.
+ return None;
+ case dwarf::DW_OP_LLVM_fragment: {
// Make the new offset point into the existing fragment.
uint64_t FragmentOffsetInBits = Op.getArg(0);
// Op.getArg(0) is FragmentOffsetInBits.
@@ -741,7 +777,8 @@ DIExpression *DIExpression::createFragmentExpression(const DIExpression *Expr,
assert((OffsetInBits + SizeInBits <= Op.getArg(0) + Op.getArg(1)) &&
"new fragment outside of original fragment");
OffsetInBits += FragmentOffsetInBits;
- break;
+ continue;
+ }
}
Ops.push_back(Op.getOp());
for (unsigned I = 0; I < Op.getNumArgs(); ++I)
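Editorial note on the DILocation::getMergedLocation hunk above: the interesting case walks both locations' inlined-at chains and keeps the deepest entry of LocB's chain that also appears in LocA's chain; the LLVM code then builds a fresh location at line 0/column 0 in that entry's scope. Below is a minimal standalone C++ sketch of just that chain walk. Loc and mergeInlinedAt are hypothetical stand-ins, not the DILocation API.

#include <unordered_set>

// Hypothetical stand-in for DILocation: each location may itself have been
// inlined at another location, forming a chain back to the outermost call.
struct Loc {
  const Loc *InlinedAt; // nullptr => not inlined anywhere
};

// Sketch of the merge rule: collect A's inlined-at chain, then walk B's chain
// and stop at the first entry the two chains share (the nearest common
// inline site). If the chains never meet, the outermost entry of B wins.
const Loc *mergeInlinedAt(const Loc *A, const Loc *B) {
  std::unordered_set<const Loc *> ChainA;
  for (const Loc *L = A->InlinedAt; L; L = L->InlinedAt)
    ChainA.insert(L);
  const Loc *Result = B;
  for (const Loc *L = B->InlinedAt; L; L = L->InlinedAt) {
    Result = L;
    if (ChainA.count(L))
      break;
  }
  return Result;
}

int main() {
  Loc Outer{nullptr};
  Loc MidA{&Outer}, MidB{&Outer}; // two inline paths that share Outer
  Loc A{&MidA}, B{&MidB};
  return mergeInlinedAt(&A, &B) == &Outer ? 0 : 1;
}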
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index ceb521c4c48..ffc3a30e6a1 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -146,9 +146,14 @@ bool Instruction::isExact() const {
return cast<PossiblyExactOperator>(this)->isExact();
}
-void Instruction::setHasUnsafeAlgebra(bool B) {
+void Instruction::setFast(bool B) {
assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
- cast<FPMathOperator>(this)->setHasUnsafeAlgebra(B);
+ cast<FPMathOperator>(this)->setFast(B);
+}
+
+void Instruction::setHasAllowReassoc(bool B) {
+ assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+ cast<FPMathOperator>(this)->setHasAllowReassoc(B);
}
void Instruction::setHasNoNaNs(bool B) {
@@ -171,6 +176,11 @@ void Instruction::setHasAllowReciprocal(bool B) {
cast<FPMathOperator>(this)->setHasAllowReciprocal(B);
}
+void Instruction::setHasApproxFunc(bool B) {
+ assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+ cast<FPMathOperator>(this)->setHasApproxFunc(B);
+}
+
void Instruction::setFastMathFlags(FastMathFlags FMF) {
assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
cast<FPMathOperator>(this)->setFastMathFlags(FMF);
@@ -181,9 +191,14 @@ void Instruction::copyFastMathFlags(FastMathFlags FMF) {
cast<FPMathOperator>(this)->copyFastMathFlags(FMF);
}
-bool Instruction::hasUnsafeAlgebra() const {
+bool Instruction::isFast() const {
assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
- return cast<FPMathOperator>(this)->hasUnsafeAlgebra();
+ return cast<FPMathOperator>(this)->isFast();
+}
+
+bool Instruction::hasAllowReassoc() const {
+ assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
+ return cast<FPMathOperator>(this)->hasAllowReassoc();
}
bool Instruction::hasNoNaNs() const {
@@ -211,6 +226,11 @@ bool Instruction::hasAllowContract() const {
return cast<FPMathOperator>(this)->hasAllowContract();
}
+bool Instruction::hasApproxFunc() const {
+ assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
+ return cast<FPMathOperator>(this)->hasApproxFunc();
+}
+
FastMathFlags Instruction::getFastMathFlags() const {
assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
return cast<FPMathOperator>(this)->getFastMathFlags();
@@ -579,7 +599,7 @@ bool Instruction::isAssociative() const {
switch (Opcode) {
case FMul:
case FAdd:
- return cast<FPMathOperator>(this)->hasUnsafeAlgebra();
+ return cast<FPMathOperator>(this)->isFast();
default:
return false;
}
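Editorial note on the Instruction.cpp hunk above: it is part of splitting the old umbrella "unsafe algebra" flag into individual fast-math flags, with isFast()/setFast() as the all-flags successor of hasUnsafeAlgebra()/setHasUnsafeAlgebra(). A hedged usage sketch against the renamed accessors follows; relaxFPMath is a hypothetical helper, and the headers are assumed to be those of this revision.

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"

using namespace llvm;

// Minimal sketch: isFast() is only true when every fast-math flag is set;
// reassociation and approximate-function are now individually settable.
static void relaxFPMath(Instruction &I) {
  if (!isa<FPMathOperator>(&I))
    return;                     // fast-math flags only exist on FP math ops
  I.setHasAllowReassoc(true);   // allow reassociation on its own
  I.setHasApproxFunc(true);     // allow approximate function substitutes
  // I.setFast(true) would instead turn on all fast-math flags at once.
}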
diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp
index a94da5452b8..c8b7c10a9a4 100644
--- a/lib/IR/LLVMContext.cpp
+++ b/lib/IR/LLVMContext.cpp
@@ -60,6 +60,7 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) {
{MD_absolute_symbol, "absolute_symbol"},
{MD_associated, "associated"},
{MD_callees, "callees"},
+ {MD_irr_loop, "irr_loop"},
};
for (auto &MDKind : MDKinds) {
diff --git a/lib/IR/MDBuilder.cpp b/lib/IR/MDBuilder.cpp
index 54783e884e9..d8e64db7c5d 100644
--- a/lib/IR/MDBuilder.cpp
+++ b/lib/IR/MDBuilder.cpp
@@ -197,3 +197,10 @@ MDNode *MDBuilder::createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType,
}
return MDNode::get(Context, {BaseType, AccessType, createConstant(Off)});
}
+
+MDNode *MDBuilder::createIrrLoopHeaderWeight(uint64_t Weight) {
+ SmallVector<Metadata *, 2> Vals(2);
+ Vals[0] = createString("loop_header_weight");
+ Vals[1] = createConstant(ConstantInt::get(Type::getInt64Ty(Context), Weight));
+ return MDNode::get(Context, Vals);
+}
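Editorial note on the MDBuilder hunk above: together with the MD_irr_loop kind registered in LLVMContext.cpp, createIrrLoopHeaderWeight produces a two-operand node of the shape !{!"loop_header_weight", i64 Weight}; the accessor whose tail appears at the top of this diff view reads operand 1 back via mdconst::extract. A hedged attachment sketch, using only entry points added in this commit plus the standard Instruction::setMetadata call (tagIrrLoopHeader is a hypothetical helper):

#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include <cstdint>

using namespace llvm;

// Tag the terminator of an irreducible-loop header block with its weight.
static void tagIrrLoopHeader(Instruction &HeaderTerminator, uint64_t Weight) {
  MDBuilder MDB(HeaderTerminator.getContext());
  HeaderTerminator.setMetadata(LLVMContext::MD_irr_loop,
                               MDB.createIrrLoopHeaderWeight(Weight));
}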
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index 51a7d424c1f..5df0c6d81cf 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -454,6 +454,28 @@ void Value::replaceUsesOutsideBlock(Value *New, BasicBlock *BB) {
}
}
+void Value::replaceUsesExceptBlockAddr(Value *New) {
+ use_iterator UI = use_begin(), E = use_end();
+ for (; UI != E;) {
+ Use &U = *UI;
+ ++UI;
+
+ if (isa<BlockAddress>(U.getUser()))
+ continue;
+
+ // Constants must be handled specially; we cannot call replaceUsesOfWith on a
+ // constant because constants are uniqued.
+ if (auto *C = dyn_cast<Constant>(U.getUser())) {
+ if (!isa<GlobalValue>(C)) {
+ C->handleOperandChange(this, New);
+ continue;
+ }
+ }
+
+ U.set(New);
+ }
+}
+
namespace {
// Various metrics for how much to strip off of pointers.
enum PointerStripKind {
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index c528f7167e7..5bb1f84d2e5 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -115,8 +115,6 @@
using namespace llvm;
-static cl::opt<bool> VerifyDebugInfo("verify-debug-info", cl::init(true));
-
namespace llvm {
struct VerifierSupport {
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index 017dd201f9c..9c737795b5a 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -630,6 +630,9 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
NonPrevailingComdats.insert(GV->getComdat());
cast<GlobalObject>(GV)->setComdat(nullptr);
}
+
+ // Set the 'local' flag based on the linker resolution for this symbol.
+ GV->setDSOLocal(Res.FinalDefinitionInLinkageUnit);
}
// Common resolution: collect the maximum size/alignment over all commons.
// We also record if we see an instance of a common as prevailing, so that
@@ -643,7 +646,6 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
CommonRes.Prevailing |= Res.Prevailing;
}
- // FIXME: use proposed local attribute for FinalDefinitionInLinkageUnit.
}
if (!M.getComdatSymbolTable().empty())
for (GlobalValue &GV : M.global_values())
@@ -698,10 +700,10 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
assert(ResI != ResE);
SymbolResolution Res = *ResI++;
- if (Res.Prevailing) {
- if (!Sym.getIRName().empty()) {
- auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
- Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
+ if (!Sym.getIRName().empty()) {
+ auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
+ Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
+ if (Res.Prevailing) {
ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
// For linker redefined symbols (via --wrap or --defsym) we want to
@@ -713,6 +715,15 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
GUID, BM.getModuleIdentifier()))
S->setLinkage(GlobalValue::WeakAnyLinkage);
}
+
+ // If the linker resolved the symbol to a local definition then mark it
+ // as local in the summary for the module we are adding.
+ if (Res.FinalDefinitionInLinkageUnit) {
+ if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
+ GUID, BM.getModuleIdentifier())) {
+ S->setDSOLocal(true);
+ }
+ }
}
}
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index 9759c0c6c1d..87867c54fad 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -469,17 +469,15 @@ void LTOCodeGenerator::restoreLinkageForExternals() {
if (I == ExternalSymbols.end())
return;
- GV.setLinkage(I->second);
- };
-
- std::for_each(MergedModule->begin(), MergedModule->end(), externalize);
- std::for_each(MergedModule->global_begin(), MergedModule->global_end(),
- externalize);
- std::for_each(MergedModule->alias_begin(), MergedModule->alias_end(),
- externalize);
-}
-
-void LTOCodeGenerator::verifyMergedModuleOnce() {
+ GV.setLinkage(I->second);
+ };
+
+ llvm::for_each(MergedModule->functions(), externalize);
+ llvm::for_each(MergedModule->globals(), externalize);
+ llvm::for_each(MergedModule->aliases(), externalize);
+}
+
+void LTOCodeGenerator::verifyMergedModuleOnce() {
// Only run on the first call.
if (HasVerifiedInput)
return;
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index 29b14414ea2..9a23e614f3a 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -290,7 +290,7 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout,
case MCFragment::FT_Padding:
return cast<MCPaddingFragment>(F).getSize();
- case MCFragment::FT_SafeSEH:
+ case MCFragment::FT_SymbolId:
return 4;
case MCFragment::FT_Align: {
@@ -563,8 +563,8 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout,
break;
}
- case MCFragment::FT_SafeSEH: {
- const MCSafeSEHFragment &SF = cast<MCSafeSEHFragment>(F);
+ case MCFragment::FT_SymbolId: {
+ const MCSymbolIdFragment &SF = cast<MCSymbolIdFragment>(F);
OW->write32(SF.getSymbol()->getIndex());
break;
}
diff --git a/lib/MC/MCFragment.cpp b/lib/MC/MCFragment.cpp
index 94839de14f8..1aed50aaeb7 100644
--- a/lib/MC/MCFragment.cpp
+++ b/lib/MC/MCFragment.cpp
@@ -281,8 +281,8 @@ void MCFragment::destroy() {
case FT_Padding:
delete cast<MCPaddingFragment>(this);
return;
- case FT_SafeSEH:
- delete cast<MCSafeSEHFragment>(this);
+ case FT_SymbolId:
+ delete cast<MCSymbolIdFragment>(this);
return;
case FT_CVInlineLines:
delete cast<MCCVInlineLineTableFragment>(this);
@@ -326,7 +326,7 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
case MCFragment::FT_DwarfFrame: OS << "MCDwarfCallFrameFragment"; break;
case MCFragment::FT_LEB: OS << "MCLEBFragment"; break;
case MCFragment::FT_Padding: OS << "MCPaddingFragment"; break;
- case MCFragment::FT_SafeSEH: OS << "MCSafeSEHFragment"; break;
+ case MCFragment::FT_SymbolId: OS << "MCSymbolIdFragment"; break;
case MCFragment::FT_CVInlineLines: OS << "MCCVInlineLineTableFragment"; break;
case MCFragment::FT_CVDefRange: OS << "MCCVDefRangeTableFragment"; break;
case MCFragment::FT_Dummy: OS << "MCDummyFragment"; break;
@@ -436,8 +436,8 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
OS << "\n ";
break;
}
- case MCFragment::FT_SafeSEH: {
- const MCSafeSEHFragment *F = cast<MCSafeSEHFragment>(this);
+ case MCFragment::FT_SymbolId: {
+ const MCSymbolIdFragment *F = cast<MCSymbolIdFragment>(this);
OS << "\n ";
OS << " Sym:" << F->getSymbol();
break;
diff --git a/lib/MC/MCWinCOFFStreamer.cpp b/lib/MC/MCWinCOFFStreamer.cpp
index 7e0533b8e00..c2583d95c5e 100644
--- a/lib/MC/MCWinCOFFStreamer.cpp
+++ b/lib/MC/MCWinCOFFStreamer.cpp
@@ -182,7 +182,7 @@ void MCWinCOFFStreamer::EmitCOFFSafeSEH(MCSymbol const *Symbol) {
if (SXData->getAlignment() < 4)
SXData->setAlignment(4);
- new MCSafeSEHFragment(Symbol, SXData);
+ new MCSymbolIdFragment(Symbol, SXData);
getAssembler().registerSymbol(*Symbol);
CSymbol->setIsSafeSEH();
diff --git a/lib/Object/ArchiveWriter.cpp b/lib/Object/ArchiveWriter.cpp
index 919e2676802..63f5082c29d 100644
--- a/lib/Object/ArchiveWriter.cpp
+++ b/lib/Object/ArchiveWriter.cpp
@@ -122,11 +122,11 @@ static void printWithSpacePadding(raw_ostream &OS, T Data, unsigned Size) {
static bool isBSDLike(object::Archive::Kind Kind) {
switch (Kind) {
case object::Archive::K_GNU:
+ case object::Archive::K_GNU64:
return false;
case object::Archive::K_BSD:
case object::Archive::K_DARWIN:
return true;
- case object::Archive::K_GNU64:
case object::Archive::K_DARWIN64:
case object::Archive::K_COFF:
break;
@@ -134,8 +134,8 @@ static bool isBSDLike(object::Archive::Kind Kind) {
llvm_unreachable("not supported for writing");
}
-static void print32(raw_ostream &Out, object::Archive::Kind Kind,
- uint32_t Val) {
+template <class T>
+static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val) {
if (isBSDLike(Kind))
support::endian::Writer<support::little>(Out).write(Val);
else
@@ -216,6 +216,20 @@ static std::string computeRelativePath(StringRef From, StringRef To) {
return Relative.str();
}
+static bool is64BitKind(object::Archive::Kind Kind) {
+ switch (Kind) {
+ case object::Archive::K_GNU:
+ case object::Archive::K_BSD:
+ case object::Archive::K_DARWIN:
+ case object::Archive::K_COFF:
+ return false;
+ case object::Archive::K_DARWIN64:
+ case object::Archive::K_GNU64:
+ return true;
+ }
+ llvm_unreachable("not supported for writing");
+}
+
static void addToStringTable(raw_ostream &Out, StringRef ArcName,
const NewArchiveMember &M, bool Thin) {
StringRef ID = M.Buf->getBufferIdentifier();
@@ -288,6 +302,14 @@ static bool isArchiveSymbol(const object::BasicSymbolRef &S) {
return true;
}
+static void printNBits(raw_ostream &Out, object::Archive::Kind Kind,
+ uint64_t Val) {
+ if (is64BitKind(Kind))
+ print<uint64_t>(Out, Kind, Val);
+ else
+ print<uint32_t>(Out, Kind, Val);
+}
+
static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
bool Deterministic, ArrayRef<MemberData> Members,
StringRef StringTable) {
@@ -299,9 +321,11 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
NumSyms += M.Symbols.size();
unsigned Size = 0;
- Size += 4; // Number of entries
+ Size += is64BitKind(Kind) ? 8 : 4; // Number of entries
if (isBSDLike(Kind))
Size += NumSyms * 8; // Table
+ else if (is64BitKind(Kind))
+ Size += NumSyms * 8; // Table
else
Size += NumSyms * 4; // Table
if (isBSDLike(Kind))
@@ -318,27 +342,30 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
if (isBSDLike(Kind))
printBSDMemberHeader(Out, Out.tell(), "__.SYMDEF", now(Deterministic), 0, 0,
0, Size);
+ else if (is64BitKind(Kind))
+ printGNUSmallMemberHeader(Out, "/SYM64", now(Deterministic), 0, 0, 0, Size);
else
printGNUSmallMemberHeader(Out, "", now(Deterministic), 0, 0, 0, Size);
uint64_t Pos = Out.tell() + Size;
if (isBSDLike(Kind))
- print32(Out, Kind, NumSyms * 8);
+ print<uint32_t>(Out, Kind, NumSyms * 8);
else
- print32(Out, Kind, NumSyms);
+ printNBits(Out, Kind, NumSyms);
for (const MemberData &M : Members) {
for (unsigned StringOffset : M.Symbols) {
if (isBSDLike(Kind))
- print32(Out, Kind, StringOffset);
- print32(Out, Kind, Pos); // member offset
+ print<uint32_t>(Out, Kind, StringOffset);
+ printNBits(Out, Kind, Pos); // member offset
}
Pos += M.Header.size() + M.Data.size() + M.Padding.size();
}
if (isBSDLike(Kind))
- print32(Out, Kind, StringTable.size()); // byte count of the string table
+ // byte count of the string table
+ print<uint32_t>(Out, Kind, StringTable.size());
Out << StringTable;
while (Pad--)
@@ -442,6 +469,25 @@ Error llvm::writeArchive(StringRef ArcName,
if (!StringTableBuf.empty())
Data.insert(Data.begin(), computeStringTable(StringTableBuf));
+ // We would like to detect if we need to switch to a 64-bit symbol table.
+ if (WriteSymtab) {
+ uint64_t MaxOffset = 0;
+ uint64_t LastOffset = MaxOffset;
+ for (const auto& M : Data) {
+ // Record the offset at which this member starts.
+ LastOffset = MaxOffset;
+ // Account for the size of each part associated with the member.
+ MaxOffset += M.Header.size() + M.Data.size() + M.Padding.size();
+ // Assume 32-bit symbol table entries to check whether 32-bit offsets suffice.
+ MaxOffset += M.Symbols.size() * 4;
+ }
+ // If LastOffset isn't going to fit in a 32-bit variable, we need to switch
+ // to 64-bit. Note that the file can be larger than 4GB as long as the last
+ // member starts before the 4GB offset.
+ if (LastOffset >> 32 != 0)
+ Kind = object::Archive::K_GNU64;
+ }
+
SmallString<128> TmpArchive;
int TmpArchiveFD;
if (auto EC = sys::fs::createUniqueFile(ArcName + ".temp-archive-%%%%%%%.a",
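Editorial note on the symbol-table sizing logic added above: the 32- vs 64-bit decision keys on the start offset of the last member, not on total file size, so the archive itself may exceed 4 GiB as long as the final member begins below that boundary. A standalone arithmetic sketch of the same check; MemberSizes and needs64BitSymbolTable are illustrative names, not LLVM types.

#include <cstdint>
#include <vector>

struct MemberSizes {
  uint64_t Header, Data, Padding, NumSymbols;
};

// Mirror of the check above: accumulate where each member would start,
// assuming 4-byte (32-bit) symbol table entries, and require only that the
// last member begins below 4 GiB; otherwise switch to the /SYM64 format.
bool needs64BitSymbolTable(const std::vector<MemberSizes> &Members) {
  uint64_t MaxOffset = 0, LastOffset = 0;
  for (const MemberSizes &M : Members) {
    LastOffset = MaxOffset;                     // offset where this member starts
    MaxOffset += M.Header + M.Data + M.Padding; // member contents
    MaxOffset += M.NumSymbols * 4;              // optimistic 32-bit entries
  }
  return (LastOffset >> 32) != 0;
}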
diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp
index ef8c844a66f..c72a1258c1e 100644
--- a/lib/Object/ELF.cpp
+++ b/lib/Object/ELF.cpp
@@ -215,269 +215,6 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) {
}
template <class ELFT>
-Expected<uint32_t>
-ELFFile<ELFT>::getSectionIndex(const Elf_Sym *Sym, Elf_Sym_Range Syms,
- ArrayRef<Elf_Word> ShndxTable) const {
- uint32_t Index = Sym->st_shndx;
- if (Index == ELF::SHN_XINDEX) {
- auto ErrorOrIndex = getExtendedSymbolTableIndex<ELFT>(
- Sym, Syms.begin(), ShndxTable);
- if (!ErrorOrIndex)
- return ErrorOrIndex.takeError();
- return *ErrorOrIndex;
- }
- if (Index == ELF::SHN_UNDEF || Index >= ELF::SHN_LORESERVE)
- return 0;
- return Index;
-}
-
-template <class ELFT>
-Expected<const typename ELFT::Shdr *>
-ELFFile<ELFT>::getSection(const Elf_Sym *Sym, const Elf_Shdr *SymTab,
- ArrayRef<Elf_Word> ShndxTable) const {
- auto SymsOrErr = symbols(SymTab);
- if (!SymsOrErr)
- return SymsOrErr.takeError();
- return getSection(Sym, *SymsOrErr, ShndxTable);
-}
-
-template <class ELFT>
-Expected<const typename ELFT::Shdr *>
-ELFFile<ELFT>::getSection(const Elf_Sym *Sym, Elf_Sym_Range Symbols,
- ArrayRef<Elf_Word> ShndxTable) const {
- auto IndexOrErr = getSectionIndex(Sym, Symbols, ShndxTable);
- if (!IndexOrErr)
- return IndexOrErr.takeError();
- uint32_t Index = *IndexOrErr;
- if (Index == 0)
- return nullptr;
- return getSection(Index);
-}
-
-template <class ELFT>
-Expected<const typename ELFT::Sym *>
-ELFFile<ELFT>::getSymbol(const Elf_Shdr *Sec, uint32_t Index) const {
- auto SymtabOrErr = symbols(Sec);
- if (!SymtabOrErr)
- return SymtabOrErr.takeError();
- return object::getSymbol<ELFT>(*SymtabOrErr, Index);
-}
-
-template <class ELFT>
-Expected<ArrayRef<uint8_t>>
-ELFFile<ELFT>::getSectionContents(const Elf_Shdr *Sec) const {
- return getSectionContentsAsArray<uint8_t>(Sec);
-}
-
-template <class ELFT>
-StringRef ELFFile<ELFT>::getRelocationTypeName(uint32_t Type) const {
- return getELFRelocationTypeName(getHeader()->e_machine, Type);
-}
-
-template <class ELFT>
-void ELFFile<ELFT>::getRelocationTypeName(uint32_t Type,
- SmallVectorImpl<char> &Result) const {
- if (!isMipsELF64()) {
- StringRef Name = getRelocationTypeName(Type);
- Result.append(Name.begin(), Name.end());
- } else {
- // The Mips N64 ABI allows up to three operations to be specified per
- // relocation record. Unfortunately there's no easy way to test for the
- // presence of N64 ELFs as they have no special flag that identifies them
- // as being N64. We can safely assume at the moment that all Mips
- // ELFCLASS64 ELFs are N64. New Mips64 ABIs should provide enough
- // information to disambiguate between old vs new ABIs.
- uint8_t Type1 = (Type >> 0) & 0xFF;
- uint8_t Type2 = (Type >> 8) & 0xFF;
- uint8_t Type3 = (Type >> 16) & 0xFF;
-
- // Concat all three relocation type names.
- StringRef Name = getRelocationTypeName(Type1);
- Result.append(Name.begin(), Name.end());
-
- Name = getRelocationTypeName(Type2);
- Result.append(1, '/');
- Result.append(Name.begin(), Name.end());
-
- Name = getRelocationTypeName(Type3);
- Result.append(1, '/');
- Result.append(Name.begin(), Name.end());
- }
-}
-
-template <class ELFT>
-Expected<const typename ELFT::Sym *>
-ELFFile<ELFT>::getRelocationSymbol(const Elf_Rel *Rel,
- const Elf_Shdr *SymTab) const {
- uint32_t Index = Rel->getSymbol(isMips64EL());
- if (Index == 0)
- return nullptr;
- return getEntry<Elf_Sym>(SymTab, Index);
-}
-
-template <class ELFT>
-Expected<StringRef>
-ELFFile<ELFT>::getSectionStringTable(Elf_Shdr_Range Sections) const {
- uint32_t Index = getHeader()->e_shstrndx;
- if (Index == ELF::SHN_XINDEX)
- Index = Sections[0].sh_link;
-
- if (!Index) // no section string table.
- return "";
- if (Index >= Sections.size())
- return createError("invalid section index");
- return getStringTable(&Sections[Index]);
-}
-
-template <class ELFT> ELFFile<ELFT>::ELFFile(StringRef Object) : Buf(Object) {}
-
-template <class ELFT>
-Expected<ELFFile<ELFT>> ELFFile<ELFT>::create(StringRef Object) {
- if (sizeof(Elf_Ehdr) > Object.size())
- return createError("Invalid buffer");
- return ELFFile(Object);
-}
-
-template <class ELFT>
-Expected<typename ELFT::ShdrRange> ELFFile<ELFT>::sections() const {
- const uintX_t SectionTableOffset = getHeader()->e_shoff;
- if (SectionTableOffset == 0)
- return ArrayRef<Elf_Shdr>();
-
- if (getHeader()->e_shentsize != sizeof(Elf_Shdr))
- return createError(
- "invalid section header entry size (e_shentsize) in ELF header");
-
- const uint64_t FileSize = Buf.size();
-
- if (SectionTableOffset + sizeof(Elf_Shdr) > FileSize)
- return createError("section header table goes past the end of the file");
-
- // Invalid address alignment of section headers
- if (SectionTableOffset & (alignof(Elf_Shdr) - 1))
- return createError("invalid alignment of section headers");
-
- const Elf_Shdr *First =
- reinterpret_cast<const Elf_Shdr *>(base() + SectionTableOffset);
-
- uintX_t NumSections = getHeader()->e_shnum;
- if (NumSections == 0)
- NumSections = First->sh_size;
-
- if (NumSections > UINT64_MAX / sizeof(Elf_Shdr))
- return createError("section table goes past the end of file");
-
- const uint64_t SectionTableSize = NumSections * sizeof(Elf_Shdr);
-
- // Section table goes past end of file!
- if (SectionTableOffset + SectionTableSize > FileSize)
- return createError("section table goes past the end of file");
-
- return makeArrayRef(First, NumSections);
-}
-
-template <class ELFT>
-Expected<const typename ELFT::Shdr *>
-ELFFile<ELFT>::getSection(uint32_t Index) const {
- auto TableOrErr = sections();
- if (!TableOrErr)
- return TableOrErr.takeError();
- return object::getSection<ELFT>(*TableOrErr, Index);
-}
-
-template <class ELFT>
-Expected<StringRef>
-ELFFile<ELFT>::getStringTable(const Elf_Shdr *Section) const {
- if (Section->sh_type != ELF::SHT_STRTAB)
- return createError("invalid sh_type for string table, expected SHT_STRTAB");
- auto V = getSectionContentsAsArray<char>(Section);
- if (!V)
- return V.takeError();
- ArrayRef<char> Data = *V;
- if (Data.empty())
- return createError("empty string table");
- if (Data.back() != '\0')
- return createError("string table non-null terminated");
- return StringRef(Data.begin(), Data.size());
-}
-
-template <class ELFT>
-Expected<ArrayRef<typename ELFT::Word>>
-ELFFile<ELFT>::getSHNDXTable(const Elf_Shdr &Section) const {
- auto SectionsOrErr = sections();
- if (!SectionsOrErr)
- return SectionsOrErr.takeError();
- return getSHNDXTable(Section, *SectionsOrErr);
-}
-
-template <class ELFT>
-Expected<ArrayRef<typename ELFT::Word>>
-ELFFile<ELFT>::getSHNDXTable(const Elf_Shdr &Section,
- Elf_Shdr_Range Sections) const {
- assert(Section.sh_type == ELF::SHT_SYMTAB_SHNDX);
- auto VOrErr = getSectionContentsAsArray<Elf_Word>(&Section);
- if (!VOrErr)
- return VOrErr.takeError();
- ArrayRef<Elf_Word> V = *VOrErr;
- auto SymTableOrErr = object::getSection<ELFT>(Sections, Section.sh_link);
- if (!SymTableOrErr)
- return SymTableOrErr.takeError();
- const Elf_Shdr &SymTable = **SymTableOrErr;
- if (SymTable.sh_type != ELF::SHT_SYMTAB &&
- SymTable.sh_type != ELF::SHT_DYNSYM)
- return createError("invalid sh_type");
- if (V.size() != (SymTable.sh_size / sizeof(Elf_Sym)))
- return createError("invalid section contents size");
- return V;
-}
-
-template <class ELFT>
-Expected<StringRef>
-ELFFile<ELFT>::getStringTableForSymtab(const Elf_Shdr &Sec) const {
- auto SectionsOrErr = sections();
- if (!SectionsOrErr)
- return SectionsOrErr.takeError();
- return getStringTableForSymtab(Sec, *SectionsOrErr);
-}
-
-template <class ELFT>
-Expected<StringRef>
-ELFFile<ELFT>::getStringTableForSymtab(const Elf_Shdr &Sec,
- Elf_Shdr_Range Sections) const {
-
- if (Sec.sh_type != ELF::SHT_SYMTAB && Sec.sh_type != ELF::SHT_DYNSYM)
- return createError(
- "invalid sh_type for symbol table, expected SHT_SYMTAB or SHT_DYNSYM");
- auto SectionOrErr = object::getSection<ELFT>(Sections, Sec.sh_link);
- if (!SectionOrErr)
- return SectionOrErr.takeError();
- return getStringTable(*SectionOrErr);
-}
-
-template <class ELFT>
-Expected<StringRef>
-ELFFile<ELFT>::getSectionName(const Elf_Shdr *Section) const {
- auto SectionsOrErr = sections();
- if (!SectionsOrErr)
- return SectionsOrErr.takeError();
- auto Table = getSectionStringTable(*SectionsOrErr);
- if (!Table)
- return Table.takeError();
- return getSectionName(Section, *Table);
-}
-
-template <class ELFT>
-Expected<StringRef> ELFFile<ELFT>::getSectionName(const Elf_Shdr *Section,
- StringRef DotShstrtab) const {
- uint32_t Offset = Section->sh_name;
- if (Offset == 0)
- return StringRef();
- if (Offset >= DotShstrtab.size())
- return createError("invalid string offset");
- return StringRef(DotShstrtab.data() + Offset);
-}
-
-template <class ELFT>
Expected<std::vector<typename ELFT::Rela>>
ELFFile<ELFT>::android_relas(const Elf_Shdr *Sec) const {
// This function reads relocations in Android's packed relocation format,
diff --git a/lib/ObjectYAML/COFFYAML.cpp b/lib/ObjectYAML/COFFYAML.cpp
index 1103159fc98..056a1aa3ca1 100644
--- a/lib/ObjectYAML/COFFYAML.cpp
+++ b/lib/ObjectYAML/COFFYAML.cpp
@@ -178,6 +178,46 @@ void ScalarEnumerationTraits<COFF::RelocationTypeAMD64>::enumeration(
ECase(IMAGE_REL_AMD64_SSPAN32);
}
+void ScalarEnumerationTraits<COFF::RelocationTypesARM>::enumeration(
+ IO &IO, COFF::RelocationTypesARM &Value) {
+ ECase(IMAGE_REL_ARM_ABSOLUTE);
+ ECase(IMAGE_REL_ARM_ADDR32);
+ ECase(IMAGE_REL_ARM_ADDR32NB);
+ ECase(IMAGE_REL_ARM_BRANCH24);
+ ECase(IMAGE_REL_ARM_BRANCH11);
+ ECase(IMAGE_REL_ARM_TOKEN);
+ ECase(IMAGE_REL_ARM_BLX24);
+ ECase(IMAGE_REL_ARM_BLX11);
+ ECase(IMAGE_REL_ARM_SECTION);
+ ECase(IMAGE_REL_ARM_SECREL);
+ ECase(IMAGE_REL_ARM_MOV32A);
+ ECase(IMAGE_REL_ARM_MOV32T);
+ ECase(IMAGE_REL_ARM_BRANCH20T);
+ ECase(IMAGE_REL_ARM_BRANCH24T);
+ ECase(IMAGE_REL_ARM_BLX23T);
+}
+
+void ScalarEnumerationTraits<COFF::RelocationTypesARM64>::enumeration(
+ IO &IO, COFF::RelocationTypesARM64 &Value) {
+ ECase(IMAGE_REL_ARM64_ABSOLUTE);
+ ECase(IMAGE_REL_ARM64_ADDR32);
+ ECase(IMAGE_REL_ARM64_ADDR32NB);
+ ECase(IMAGE_REL_ARM64_BRANCH26);
+ ECase(IMAGE_REL_ARM64_PAGEBASE_REL21);
+ ECase(IMAGE_REL_ARM64_REL21);
+ ECase(IMAGE_REL_ARM64_PAGEOFFSET_12A);
+ ECase(IMAGE_REL_ARM64_PAGEOFFSET_12L);
+ ECase(IMAGE_REL_ARM64_SECREL);
+ ECase(IMAGE_REL_ARM64_SECREL_LOW12A);
+ ECase(IMAGE_REL_ARM64_SECREL_HIGH12A);
+ ECase(IMAGE_REL_ARM64_SECREL_LOW12L);
+ ECase(IMAGE_REL_ARM64_TOKEN);
+ ECase(IMAGE_REL_ARM64_SECTION);
+ ECase(IMAGE_REL_ARM64_ADDR64);
+ ECase(IMAGE_REL_ARM64_BRANCH19);
+ ECase(IMAGE_REL_ARM64_BRANCH14);
+}
+
void ScalarEnumerationTraits<COFF::WindowsSubsystem>::enumeration(
IO &IO, COFF::WindowsSubsystem &Value) {
ECase(IMAGE_SUBSYSTEM_UNKNOWN);
@@ -378,6 +418,14 @@ void MappingTraits<COFFYAML::Relocation>::mapping(IO &IO,
MappingNormalization<NType<COFF::RelocationTypeAMD64>, uint16_t> NT(
IO, Rel.Type);
IO.mapRequired("Type", NT->Type);
+ } else if (H.Machine == COFF::IMAGE_FILE_MACHINE_ARMNT) {
+ MappingNormalization<NType<COFF::RelocationTypesARM>, uint16_t> NT(
+ IO, Rel.Type);
+ IO.mapRequired("Type", NT->Type);
+ } else if (H.Machine == COFF::IMAGE_FILE_MACHINE_ARM64) {
+ MappingNormalization<NType<COFF::RelocationTypesARM64>, uint16_t> NT(
+ IO, Rel.Type);
+ IO.mapRequired("Type", NT->Type);
} else {
IO.mapRequired("Type", Rel.Type);
}
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index 8796ff56e5e..9abbdba26cb 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -89,6 +89,7 @@
#include "llvm/Transforms/Scalar/ADCE.h"
#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
#include "llvm/Transforms/Scalar/BDCE.h"
+#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
#include "llvm/Transforms/Scalar/ConstantHoisting.h"
#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
#include "llvm/Transforms/Scalar/DCE.h"
@@ -548,6 +549,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
EarlyFPM.addPass(SROA());
EarlyFPM.addPass(EarlyCSEPass());
EarlyFPM.addPass(LowerExpectIntrinsicPass());
+ if (Level == O3)
+ EarlyFPM.addPass(CallSiteSplittingPass());
+
// In SamplePGO ThinLTO backend, we need instcombine before profile annotation
// to convert bitcast to direct calls so that they can be inlined during the
// profile annotation preparation step.
@@ -915,13 +919,16 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
MPM.addPass(InferFunctionAttrsPass());
if (Level > 1) {
+ FunctionPassManager EarlyFPM(DebugLogging);
+ EarlyFPM.addPass(CallSiteSplittingPass());
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM)));
+
// Indirect call promotion. This should promote all the targets that are
// left by the earlier promotion pass that promotes intra-module targets.
// This two-step promotion is to save the compile time. For LTO, it should
// produce the same result as if we only do promotion here.
MPM.addPass(PGOIndirectCallPromotion(
true /* InLTO */, PGOOpt && !PGOOpt->SampleProfileFile.empty()));
-
// Propagate constants at call sites into the functions they call. This
// opens opportunities for globalopt (and inlining) by substituting function
// pointers passed as arguments to direct uses of functions.
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def
index 20d1220ac33..40b884351fd 100644
--- a/lib/Passes/PassRegistry.def
+++ b/lib/Passes/PassRegistry.def
@@ -140,6 +140,7 @@ FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass())
FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass())
FUNCTION_PASS("bdce", BDCEPass())
FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass())
+FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass())
FUNCTION_PASS("consthoist", ConstantHoistingPass())
FUNCTION_PASS("correlated-propagation", CorrelatedValuePropagationPass())
FUNCTION_PASS("dce", DCEPass())
diff --git a/lib/ProfileData/CMakeLists.txt b/lib/ProfileData/CMakeLists.txt
index cd65762ae6a..3a981d8acf4 100644
--- a/lib/ProfileData/CMakeLists.txt
+++ b/lib/ProfileData/CMakeLists.txt
@@ -1,4 +1,5 @@
add_llvm_library(LLVMProfileData
+ GCOV.cpp
InstrProf.cpp
InstrProfReader.cpp
InstrProfWriter.cpp
diff --git a/lib/IR/GCOV.cpp b/lib/ProfileData/GCOV.cpp
index d4b45522822..d6e44389f2b 100644
--- a/lib/IR/GCOV.cpp
+++ b/lib/ProfileData/GCOV.cpp
@@ -12,7 +12,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/GCOV.h"
+#include "llvm/ProfileData/GCOV.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
diff --git a/lib/Support/Chrono.cpp b/lib/Support/Chrono.cpp
index a39b485bd13..84f5aab6fc4 100644
--- a/lib/Support/Chrono.cpp
+++ b/lib/Support/Chrono.cpp
@@ -65,17 +65,17 @@ void format_provider<TimePoint<>>::format(const TimePoint<> &T, raw_ostream &OS,
if (Style[I] == '%' && Style.size() > I + 1) switch (Style[I + 1]) {
case 'L': // Milliseconds, from Ruby.
FStream << llvm::format(
- "%.3lu", duration_cast<milliseconds>(Fractional).count());
+ "%.3lu", (long)duration_cast<milliseconds>(Fractional).count());
++I;
continue;
case 'f': // Microseconds, from Python.
FStream << llvm::format(
- "%.6lu", duration_cast<microseconds>(Fractional).count());
+ "%.6lu", (long)duration_cast<microseconds>(Fractional).count());
++I;
continue;
case 'N': // Nanoseconds, from date(1).
FStream << llvm::format(
- "%.6lu", duration_cast<nanoseconds>(Fractional).count());
+ "%.6lu", (long)duration_cast<nanoseconds>(Fractional).count());
++I;
continue;
case '%': // Consume %%, so %%f parses as (%%)f not %(%f)
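Editorial note on the Chrono.cpp hunk above: std::chrono::duration::count() returns the implementation-defined rep type, which need not match a printf-style length modifier, so the patch pins the value to long before formatting. A small standalone illustration of the same defensive cast, not the LLVM code itself:

#include <chrono>
#include <cstdio>

int main() {
  using namespace std::chrono;
  auto Fractional = milliseconds(42) + microseconds(7);
  // count() returns duration::rep, whose width differs between standard
  // library implementations; casting to a known integer type keeps the
  // printf format specifier honest on every platform.
  std::printf("%.3ld ms\n",
              (long)duration_cast<milliseconds>(Fractional).count());
  return 0;
}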
diff --git a/lib/Support/FileOutputBuffer.cpp b/lib/Support/FileOutputBuffer.cpp
index 1e20b01fc4a..8906be3aaa2 100644
--- a/lib/Support/FileOutputBuffer.cpp
+++ b/lib/Support/FileOutputBuffer.cpp
@@ -38,7 +38,7 @@ public:
std::unique_ptr<fs::mapped_file_region> Buf)
: FileOutputBuffer(Path), Buffer(std::move(Buf)), TempPath(TempPath) {}
- static ErrorOr<std::unique_ptr<OnDiskBuffer>>
+ static Expected<std::unique_ptr<OnDiskBuffer>>
create(StringRef Path, size_t Size, unsigned Mode);
uint8_t *getBufferStart() const override { return (uint8_t *)Buffer->data(); }
@@ -49,14 +49,14 @@ public:
size_t getBufferSize() const override { return Buffer->size(); }
- std::error_code commit() override {
+ Error commit() override {
// Unmap buffer, letting OS flush dirty pages to file on disk.
Buffer.reset();
// Atomically replace the existing file with the new one.
auto EC = fs::rename(TempPath, FinalPath);
sys::DontRemoveFileOnSignal(TempPath);
- return EC;
+ return errorCodeToError(EC);
}
~OnDiskBuffer() override {
@@ -78,13 +78,13 @@ public:
InMemoryBuffer(StringRef Path, MemoryBlock Buf, unsigned Mode)
: FileOutputBuffer(Path), Buffer(Buf), Mode(Mode) {}
- static ErrorOr<std::unique_ptr<InMemoryBuffer>>
+ static Expected<std::unique_ptr<InMemoryBuffer>>
create(StringRef Path, size_t Size, unsigned Mode) {
std::error_code EC;
MemoryBlock MB = Memory::allocateMappedMemory(
Size, nullptr, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC);
if (EC)
- return EC;
+ return errorCodeToError(EC);
return llvm::make_unique<InMemoryBuffer>(Path, MB, Mode);
}
@@ -96,14 +96,14 @@ public:
size_t getBufferSize() const override { return Buffer.size(); }
- std::error_code commit() override {
+ Error commit() override {
int FD;
std::error_code EC;
if (auto EC = openFileForWrite(FinalPath, FD, fs::F_None, Mode))
- return EC;
+ return errorCodeToError(EC);
raw_fd_ostream OS(FD, /*shouldClose=*/true, /*unbuffered=*/true);
OS << StringRef((const char *)Buffer.base(), Buffer.size());
- return std::error_code();
+ return Error::success();
}
private:
@@ -111,13 +111,13 @@ private:
unsigned Mode;
};
-ErrorOr<std::unique_ptr<OnDiskBuffer>>
+Expected<std::unique_ptr<OnDiskBuffer>>
OnDiskBuffer::create(StringRef Path, size_t Size, unsigned Mode) {
// Create new file in same directory but with random name.
SmallString<128> TempPath;
int FD;
if (auto EC = fs::createUniqueFile(Path + ".tmp%%%%%%%", FD, TempPath, Mode))
- return EC;
+ return errorCodeToError(EC);
sys::RemoveFileOnSignal(TempPath);
@@ -128,7 +128,7 @@ OnDiskBuffer::create(StringRef Path, size_t Size, unsigned Mode) {
// pretty slow just like it writes specified amount of bytes,
// so we should avoid calling that function.
if (auto EC = fs::resize_file(FD, Size))
- return EC;
+ return errorCodeToError(EC);
#endif
// Mmap it.
@@ -137,12 +137,12 @@ OnDiskBuffer::create(StringRef Path, size_t Size, unsigned Mode) {
FD, fs::mapped_file_region::readwrite, Size, 0, EC);
close(FD);
if (EC)
- return EC;
+ return errorCodeToError(EC);
return llvm::make_unique<OnDiskBuffer>(Path, TempPath, std::move(MappedFile));
}
// Create an instance of FileOutputBuffer.
-ErrorOr<std::unique_ptr<FileOutputBuffer>>
+Expected<std::unique_ptr<FileOutputBuffer>>
FileOutputBuffer::create(StringRef Path, size_t Size, unsigned Flags) {
unsigned Mode = fs::all_read | fs::all_write;
if (Flags & F_executable)
@@ -161,7 +161,7 @@ FileOutputBuffer::create(StringRef Path, size_t Size, unsigned Flags) {
// destination file and write to it on commit().
switch (Stat.type()) {
case fs::file_type::directory_file:
- return errc::is_a_directory;
+ return errorCodeToError(errc::is_a_directory);
case fs::file_type::regular_file:
case fs::file_type::file_not_found:
case fs::file_type::status_error:
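Editorial note on the FileOutputBuffer changes above: create() and commit() now return Expected/Error instead of ErrorOr/std::error_code, so call sites must consume the error explicitly. A hedged usage sketch follows; writeZeros is a hypothetical helper, the create() signature is taken from this commit, and error propagation uses the standard llvm::Error idiom.

#include "llvm/Support/Error.h"
#include "llvm/Support/FileOutputBuffer.h"
#include <cstring>
#include <memory>

using namespace llvm;

// Zero-fill a new file of the given size through the Expected-based API.
static Error writeZeros(StringRef Path, size_t Size) {
  Expected<std::unique_ptr<FileOutputBuffer>> BufOrErr =
      FileOutputBuffer::create(Path, Size, 0);
  if (!BufOrErr)
    return BufOrErr.takeError();     // callers must consume the Error now
  std::unique_ptr<FileOutputBuffer> &Buf = *BufOrErr;
  std::memset(Buf->getBufferStart(), 0, Size);
  return Buf->commit();              // commit() now returns llvm::Error
}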
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index d8fb3e1dc1d..5b2a0f1d0c2 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -351,12 +351,14 @@ enum ProcessorTypes {
INTEL_PENTIUM_IV,
INTEL_PENTIUM_M,
INTEL_CORE_DUO,
- INTEL_X86_64,
INTEL_NOCONA,
INTEL_PRESCOTT,
AMD_i486,
AMDPENTIUM,
- AMDATHLON,
+ AMD_ATHLON,
+ AMD_ATHLON_XP,
+ AMD_K8,
+ AMD_K8SSE3,
INTEL_GOLDMONT,
CPU_TYPE_MAX
};
@@ -385,10 +387,6 @@ enum ProcessorSubtypes {
AMDPENTIUM_K62,
AMDPENTIUM_K63,
AMDPENTIUM_GEODE,
- AMDATHLON_CLASSIC,
- AMDATHLON_XP,
- AMDATHLON_K8,
- AMDATHLON_K8SSE3,
CPU_SUBTYPE_MAX
};
@@ -794,8 +792,13 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
break;
}
if (Features2 & (1 << (FEATURE_EM64T - 32))) {
- *Type = INTEL_X86_64;
- break; // x86-64
+ *Type = INTEL_CORE2; // "core2"
+ *Subtype = INTEL_CORE2_65;
+ break;
+ }
+ if (Features & (1 << FEATURE_SSE3)) {
+ *Type = INTEL_CORE_DUO;
+ break;
}
if (Features & (1 << FEATURE_SSE2)) {
*Type = INTEL_PENTIUM_M;
@@ -814,40 +817,15 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
}
break;
case 15: {
- switch (Model) {
- case 0: // Pentium 4 processor, Intel Xeon processor. All processors are
- // model 00h and manufactured using the 0.18 micron process.
- case 1: // Pentium 4 processor, Intel Xeon processor, Intel Xeon
- // processor MP, and Intel Celeron processor. All processors are
- // model 01h and manufactured using the 0.18 micron process.
- case 2: // Pentium 4 processor, Mobile Intel Pentium 4 processor - M,
- // Intel Xeon processor, Intel Xeon processor MP, Intel Celeron
- // processor, and Mobile Intel Celeron processor. All processors
- // are model 02h and manufactured using the 0.13 micron process.
- *Type = ((Features2 & (1 << (FEATURE_EM64T - 32))) ? INTEL_X86_64
- : INTEL_PENTIUM_IV);
+ if (Features2 & (1 << (FEATURE_EM64T - 32))) {
+ *Type = INTEL_NOCONA;
break;
-
- case 3: // Pentium 4 processor, Intel Xeon processor, Intel Celeron D
- // processor. All processors are model 03h and manufactured using
- // the 90 nm process.
- case 4: // Pentium 4 processor, Pentium 4 processor Extreme Edition,
- // Pentium D processor, Intel Xeon processor, Intel Xeon
- // processor MP, Intel Celeron D processor. All processors are
- // model 04h and manufactured using the 90 nm process.
- case 6: // Pentium 4 processor, Pentium D processor, Pentium processor
- // Extreme Edition, Intel Xeon processor, Intel Xeon processor
- // MP, Intel Celeron D processor. All processors are model 06h
- // and manufactured using the 65 nm process.
- *Type = ((Features2 & (1 << (FEATURE_EM64T - 32))) ? INTEL_NOCONA
- : INTEL_PRESCOTT);
- break;
-
- default:
- *Type = ((Features2 & (1 << (FEATURE_EM64T - 32))) ? INTEL_X86_64
- : INTEL_PENTIUM_IV);
+ }
+ if (Features & (1 << FEATURE_SSE3)) {
+ *Type = INTEL_PRESCOTT;
break;
}
+ *Type = INTEL_PENTIUM_IV;
break;
}
default:
@@ -885,20 +863,18 @@ static void getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model,
}
break;
case 6:
- *Type = AMDATHLON;
if (Features & (1 << FEATURE_SSE)) {
- *Subtype = AMDATHLON_XP;
+ *Type = AMD_ATHLON_XP;
break; // "athlon-xp"
}
- *Subtype = AMDATHLON_CLASSIC;
+ *Type = AMD_ATHLON;
break; // "athlon"
case 15:
- *Type = AMDATHLON;
if (Features & (1 << FEATURE_SSE3)) {
- *Subtype = AMDATHLON_K8SSE3;
+ *Type = AMD_K8SSE3;
break; // "k8-sse3"
}
- *Subtype = AMDATHLON_K8;
+ *Type = AMD_K8;
break; // "k8"
case 16:
*Type = AMDFAM10H; // "amdfam10"
@@ -1078,8 +1054,8 @@ StringRef sys::getHostCPUName() {
detectX86FamilyModel(EAX, &Family, &Model);
getAvailableFeatures(ECX, EDX, MaxLeaf, &Features, &Features2);
- unsigned Type;
- unsigned Subtype;
+ unsigned Type = 0;
+ unsigned Subtype = 0;
if (Vendor == SIG_INTEL) {
getIntelProcessorTypeAndSubtype(Family, Model, Brand_id, Features,
@@ -1145,8 +1121,6 @@ StringRef sys::getHostCPUName() {
return "knl";
case INTEL_KNM:
return "knm";
- case INTEL_X86_64:
- return "x86-64";
case INTEL_NOCONA:
return "nocona";
case INTEL_PRESCOTT:
@@ -1172,19 +1146,14 @@ StringRef sys::getHostCPUName() {
default:
return "pentium";
}
- case AMDATHLON:
- switch (Subtype) {
- case AMDATHLON_CLASSIC:
- return "athlon";
- case AMDATHLON_XP:
- return "athlon-xp";
- case AMDATHLON_K8:
- return "k8";
- case AMDATHLON_K8SSE3:
- return "k8-sse3";
- default:
- llvm_unreachable("Unexpected subtype!");
- }
+ case AMD_ATHLON:
+ return "athlon";
+ case AMD_ATHLON_XP:
+ return "athlon-xp";
+ case AMD_K8:
+ return "k8";
+ case AMD_K8SSE3:
+ return "k8-sse3";
case AMDFAM10H:
return "amdfam10";
case AMD_BTVER1:
diff --git a/lib/Support/LowLevelType.cpp b/lib/Support/LowLevelType.cpp
index 0ee3f1d0119..cb2187405d6 100644
--- a/lib/Support/LowLevelType.cpp
+++ b/lib/Support/LowLevelType.cpp
@@ -43,7 +43,7 @@ void LLT::print(raw_ostream &OS) const {
assert(isScalar() && "unexpected type");
OS << "s" << getScalarSizeInBits();
} else
- llvm_unreachable("trying to print an invalid type");
+ OS << "LLT_invalid";
}
const constexpr LLT::BitFieldInfo LLT::ScalarSizeFieldInfo;
diff --git a/lib/Support/SpecialCaseList.cpp b/lib/Support/SpecialCaseList.cpp
index a659a2afee6..bf807e66e02 100644
--- a/lib/Support/SpecialCaseList.cpp
+++ b/lib/Support/SpecialCaseList.cpp
@@ -27,6 +27,7 @@
namespace llvm {
bool SpecialCaseList::Matcher::insert(std::string Regexp,
+ unsigned LineNumber,
std::string &REError) {
if (Regexp.empty()) {
REError = "Supplied regexp was blank";
@@ -34,7 +35,7 @@ bool SpecialCaseList::Matcher::insert(std::string Regexp,
}
if (Regex::isLiteralERE(Regexp)) {
- Strings.insert(Regexp);
+ Strings[Regexp] = LineNumber;
return true;
}
Trigrams.insert(Regexp);
@@ -45,34 +46,30 @@ bool SpecialCaseList::Matcher::insert(std::string Regexp,
Regexp.replace(pos, strlen("*"), ".*");
}
+ Regexp = (Twine("^(") + StringRef(Regexp) + ")$").str();
+
// Check that the regexp is valid.
Regex CheckRE(Regexp);
if (!CheckRE.isValid(REError))
return false;
- if (!UncompiledRegEx.empty())
- UncompiledRegEx += "|";
- UncompiledRegEx += "^(" + Regexp + ")$";
+ RegExes.emplace_back(
+ std::make_pair(make_unique<Regex>(std::move(CheckRE)), LineNumber));
return true;
}
-void SpecialCaseList::Matcher::compile() {
- if (!UncompiledRegEx.empty()) {
- RegEx.reset(new Regex(UncompiledRegEx));
- UncompiledRegEx.clear();
- }
-}
-
-bool SpecialCaseList::Matcher::match(StringRef Query) const {
- if (Strings.count(Query))
- return true;
+unsigned SpecialCaseList::Matcher::match(StringRef Query) const {
+ auto It = Strings.find(Query);
+ if (It != Strings.end())
+ return It->second;
if (Trigrams.isDefinitelyOut(Query))
return false;
- return RegEx && RegEx->match(Query);
+ for (auto& RegExKV : RegExes)
+ if (RegExKV.first->match(Query))
+ return RegExKV.second;
+ return 0;
}
-SpecialCaseList::SpecialCaseList() : Sections(), IsCompiled(false) {}
-
std::unique_ptr<SpecialCaseList>
SpecialCaseList::create(const std::vector<std::string> &Paths,
std::string &Error) {
@@ -114,7 +111,6 @@ bool SpecialCaseList::createInternal(const std::vector<std::string> &Paths,
return false;
}
}
- compile();
return true;
}
@@ -123,7 +119,6 @@ bool SpecialCaseList::createInternal(const MemoryBuffer *MB,
StringMap<size_t> Sections;
if (!parse(MB, Sections, Error))
return false;
- compile();
return true;
}
@@ -132,11 +127,13 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB,
std::string &Error) {
// Iterate through each line in the blacklist file.
SmallVector<StringRef, 16> Lines;
- SplitString(MB->getBuffer(), Lines, "\n\r");
+ MB->getBuffer().split(Lines, '\n');
- int LineNo = 1;
+ unsigned LineNo = 1;
StringRef Section = "*";
+
for (auto I = Lines.begin(), E = Lines.end(); I != E; ++I, ++LineNo) {
+ *I = I->trim();
// Ignore empty lines and lines starting with "#"
if (I->empty() || I->startswith("#"))
continue;
@@ -181,11 +178,10 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB,
if (SectionsMap.find(Section) == SectionsMap.end()) {
std::unique_ptr<Matcher> M = make_unique<Matcher>();
std::string REError;
- if (!M->insert(Section, REError)) {
+ if (!M->insert(Section, LineNo, REError)) {
Error = (Twine("malformed section ") + Section + ": '" + REError).str();
return false;
}
- M->compile();
SectionsMap[Section] = Sections.size();
Sections.emplace_back(std::move(M));
@@ -193,7 +189,7 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB,
auto &Entry = Sections[SectionsMap[Section]].Entries[Prefix][Category];
std::string REError;
- if (!Entry.insert(std::move(Regexp), REError)) {
+ if (!Entry.insert(std::move(Regexp), LineNo, REError)) {
Error = (Twine("malformed regex in line ") + Twine(LineNo) + ": '" +
SplitLine.second + "': " + REError).str();
return false;
@@ -202,38 +198,33 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB,
return true;
}
-void SpecialCaseList::compile() {
- assert(!IsCompiled && "compile() should only be called once");
- // Iterate through every section compiling regular expressions for every query
- // and creating Section entries.
- for (auto &Section : Sections)
- for (auto &Prefix : Section.Entries)
- for (auto &Category : Prefix.getValue())
- Category.getValue().compile();
-
- IsCompiled = true;
-}
-
SpecialCaseList::~SpecialCaseList() {}
bool SpecialCaseList::inSection(StringRef Section, StringRef Prefix,
StringRef Query, StringRef Category) const {
- assert(IsCompiled && "SpecialCaseList::compile() was not called!");
+ return inSectionBlame(Section, Prefix, Query, Category);
+}
+unsigned SpecialCaseList::inSectionBlame(StringRef Section, StringRef Prefix,
+ StringRef Query,
+ StringRef Category) const {
for (auto &SectionIter : Sections)
- if (SectionIter.SectionMatcher->match(Section) &&
- inSection(SectionIter.Entries, Prefix, Query, Category))
- return true;
-
- return false;
+ if (SectionIter.SectionMatcher->match(Section)) {
+ unsigned Blame =
+ inSectionBlame(SectionIter.Entries, Prefix, Query, Category);
+ if (Blame)
+ return Blame;
+ }
+ return 0;
}
-bool SpecialCaseList::inSection(const SectionEntries &Entries, StringRef Prefix,
- StringRef Query, StringRef Category) const {
+unsigned SpecialCaseList::inSectionBlame(const SectionEntries &Entries,
+ StringRef Prefix, StringRef Query,
+ StringRef Category) const {
SectionEntries::const_iterator I = Entries.find(Prefix);
- if (I == Entries.end()) return false;
+ if (I == Entries.end()) return 0;
StringMap<Matcher>::const_iterator II = I->second.find(Category);
- if (II == I->second.end()) return false;
+ if (II == I->second.end()) return 0;
return II->getValue().match(Query);
}
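Editorial note on the SpecialCaseList rework above: every matcher now remembers the 1-based line it came from, so inSectionBlame can report which blacklist line matched, with 0 meaning "no match"; literal strings are checked first, then the anchored regexes in order. A simplified standalone sketch of that lookup order; MiniMatcher is an illustrative class, not the LLVM one, and the trigram pre-filter is omitted.

#include <map>
#include <memory>
#include <regex>
#include <string>
#include <utility>
#include <vector>

// Exact strings map directly to their line number; everything else is kept
// as an anchored regex paired with its line number. 0 means "no match".
class MiniMatcher {
  std::map<std::string, unsigned> Strings;
  std::vector<std::pair<std::regex, unsigned>> RegExes;

public:
  void insert(const std::string &Pattern, unsigned LineNumber, bool Literal) {
    if (Literal)
      Strings[Pattern] = LineNumber;
    else
      RegExes.emplace_back(std::regex("^(" + Pattern + ")$"), LineNumber);
  }

  unsigned match(const std::string &Query) const {
    auto It = Strings.find(Query);
    if (It != Strings.end())
      return It->second;
    for (const auto &RE : RegExes)
      if (std::regex_search(Query, RE.first))
        return RE.second;
    return 0;
  }
};

int main() {
  MiniMatcher M;
  M.insert("foo", 3, /*Literal=*/true);    // e.g. blacklist line 3: "fun:foo"
  M.insert("bar.*", 7, /*Literal=*/false); // e.g. blacklist line 7: "fun:bar*"
  return (M.match("foo") == 3 && M.match("barbaz") == 7) ? 0 : 1;
}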
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index 781a911ed57..2ecb97316c8 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -426,7 +426,7 @@ std::error_code resize_file(int FD, uint64_t Size) {
// If we have posix_fallocate use it. Unlike ftruncate it always allocates
// space, so we get an error if the disk is full.
if (int Err = ::posix_fallocate(FD, 0, Size)) {
- if (Err != EOPNOTSUPP)
+ if (Err != EINVAL && Err != EOPNOTSUPP)
return std::error_code(Err, std::generic_category());
}
#endif
diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp
index e6afb42440a..7de5d0ef66b 100644
--- a/lib/Target/AArch64/AArch64A53Fix835769.cpp
+++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -22,9 +22,9 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h
index bc44bc5f246..461c01318d4 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.h
+++ b/lib/Target/AArch64/AArch64CallingConvention.h
@@ -19,8 +19,8 @@
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/CallingConv.h"
-#include "llvm/Target/TargetInstrInfo.h"
namespace {
using namespace llvm;
diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp
index 51700f90597..2793481a0c1 100644
--- a/lib/Target/AArch64/AArch64CondBrTuning.cpp
+++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp
@@ -34,9 +34,9 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index 8bbef44a2e6..c3a354cb01d 100644
--- a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -73,11 +73,11 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cassert>
#include <cstdlib>
diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 9eda56c825a..33e0f5de5fd 100644
--- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -31,10 +31,10 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 5d8b4b69593..d182c812189 100644
--- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -20,9 +20,9 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 8ebcaff1358..809e4a77fad 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -110,6 +110,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
@@ -121,7 +122,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index c351efb0c39..55a256867fa 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -14,7 +14,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
diff --git a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
index 7d2cfbeff38..39f50ade747 100644
--- a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
+++ b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -14,19 +14,21 @@
namespace llvm {
RegisterBankInfo::PartialMapping AArch64GenRegisterBankInfo::PartMappings[]{
/* StartIdx, Length, RegBank */
- // 0: FPR 32-bit value.
+ // 0: FPR 16-bit value.
+ {0, 16, AArch64::FPRRegBank},
+ // 1: FPR 32-bit value.
{0, 32, AArch64::FPRRegBank},
- // 1: FPR 64-bit value.
+ // 2: FPR 64-bit value.
{0, 64, AArch64::FPRRegBank},
- // 2: FPR 128-bit value.
+ // 3: FPR 128-bit value.
{0, 128, AArch64::FPRRegBank},
- // 3: FPR 256-bit value.
+ // 4: FPR 256-bit value.
{0, 256, AArch64::FPRRegBank},
- // 4: FPR 512-bit value.
+ // 5: FPR 512-bit value.
{0, 512, AArch64::FPRRegBank},
- // 5: GPR 32-bit value.
+ // 6: GPR 32-bit value.
{0, 32, AArch64::GPRRegBank},
- // 6: GPR 64-bit value.
+ // 7: GPR 64-bit value.
{0, 64, AArch64::GPRRegBank},
};
@@ -37,58 +39,77 @@ RegisterBankInfo::ValueMapping AArch64GenRegisterBankInfo::ValMappings[]{
{nullptr, 0},
// 3-operands instructions (all binary operations should end up with one of
// those mapping).
- // 1: FPR 32-bit value. <-- This must match First3OpsIdx.
+ // 1: FPR 16-bit value. <-- This must match First3OpsIdx.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1},
+ // 4: FPR 32-bit value. <-- This must match First3OpsIdx.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
- // 4: FPR 64-bit value.
+ // 7: FPR 64-bit value.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
- // 7: FPR 128-bit value.
+ // 10: FPR 128-bit value.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
- // 10: FPR 256-bit value.
+ // 13: FPR 256-bit value.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
- // 13: FPR 512-bit value.
+ // 16: FPR 512-bit value.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
- // 16: GPR 32-bit value.
+ // 19: GPR 32-bit value.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
- // 19: GPR 64-bit value. <-- This must match Last3OpsIdx.
+ // 22: GPR 64-bit value. <-- This must match Last3OpsIdx.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
// Cross register bank copies.
- // 22: FPR 32-bit value to GPR 32-bit value. <-- This must match
+ // 25: FPR 16-bit value to GPR 16-bit (invalid). <-- This must match
// FirstCrossRegCpyIdx.
+ {nullptr, 1},
+ {nullptr, 1},
+ // 27: FPR 32-bit value to GPR 32-bit value.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
- // 24: FPR 64-bit value to GPR 64-bit value.
+ // 29: FPR 64-bit value to GPR 64-bit value.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
- // 26: FPR 128-bit value to GPR 128-bit value (invalid)
+ // 31: FPR 128-bit value to GPR 128-bit value (invalid)
{nullptr, 1},
{nullptr, 1},
- // 28: FPR 256-bit value to GPR 256-bit value (invalid)
+ // 33: FPR 256-bit value to GPR 256-bit value (invalid)
{nullptr, 1},
{nullptr, 1},
- // 30: FPR 512-bit value to GPR 512-bit value (invalid)
+ // 35: FPR 512-bit value to GPR 512-bit value (invalid)
{nullptr, 1},
{nullptr, 1},
- // 32: GPR 32-bit value to FPR 32-bit value.
+ // 37: GPR 32-bit value to FPR 32-bit value.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
- // 34: GPR 64-bit value to FPR 64-bit value. <-- This must match
+ // 39: GPR 64-bit value to FPR 64-bit value. <-- This must match
// LastCrossRegCpyIdx.
{&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+ // 41: FPExt: 16 to 32. <-- This must match FPExt16To32Idx.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1},
+ // 43: FPExt: 16 to 64. <-- This must match FPExt16To64Idx.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1},
+ // 45: FPExt: 32 to 64. <-- This must match FPExt32To64Idx.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ // 47: FPExt vector: 64 to 128. <-- This must match FPExt64To128Idx.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
};
bool AArch64GenRegisterBankInfo::checkPartialMap(unsigned Idx,
@@ -145,16 +166,18 @@ unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx,
return -1;
}
if (RBIdx == PMI_FirstFPR) {
- if (Size <= 32)
+ if (Size <= 16)
return 0;
- if (Size <= 64)
+ if (Size <= 32)
return 1;
- if (Size <= 128)
+ if (Size <= 64)
return 2;
- if (Size <= 256)
+ if (Size <= 128)
return 3;
- if (Size <= 512)
+ if (Size <= 256)
return 4;
+ if (Size <= 512)
+ return 5;
return -1;
}
return -1;
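The renumbered "// N:" comments in the ValMappings table above follow from index arithmetic of the form First3OpsIdx + (bank index - PMI_Min + size offset) * DistanceBetweenRegBanks, using the enum values this patch introduces in AArch64RegisterBankInfo.h further down. A minimal standalone sketch of that arithmetic, assuming that layout; the helpers fprOffset and threeOpsIdx are invented for illustration and are not LLVM APIs:

#include <cassert>
#include <cstdio>

namespace {
enum PartialMappingIdx {
  PMI_FPR16 = 1, PMI_FPR32, PMI_FPR64, PMI_FPR128, PMI_FPR256, PMI_FPR512,
  PMI_GPR32, PMI_GPR64,
  PMI_FirstFPR = PMI_FPR16, PMI_FirstGPR = PMI_GPR32, PMI_Min = PMI_FirstFPR,
};
enum ValueMappingIdx { First3OpsIdx = 1, DistanceBetweenRegBanks = 3 };

// Post-patch FPR size offsets: 16 -> 0, 32 -> 1, 64 -> 2, ..., 512 -> 5.
unsigned fprOffset(unsigned Size) {
  unsigned Offset = 0;
  for (unsigned Bits = 16; Bits <= 512; Bits *= 2, ++Offset)
    if (Size <= Bits)
      return Offset;
  return -1u;
}

// Index of the first of the three identical entries for one bank/size pair.
unsigned threeOpsIdx(PartialMappingIdx RBIdx, unsigned SizeOffset) {
  return First3OpsIdx + (RBIdx - PMI_Min + SizeOffset) * DistanceBetweenRegBanks;
}
} // namespace

int main() {
  assert(threeOpsIdx(PMI_FirstFPR, fprOffset(64)) == 7);  // "7: FPR 64-bit"
  assert(threeOpsIdx(PMI_FirstGPR, 0) == 19);             // "19: GPR 32-bit"
  assert(threeOpsIdx(PMI_FirstGPR, 1) == 22);             // "22: GPR 64-bit" == Last3OpsIdx
  std::puts("index arithmetic matches the renumbered comments");
}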
@@ -206,4 +229,35 @@ AArch64GenRegisterBankInfo::getCopyMapping(unsigned DstBankID,
ValMappingIdx <= LastCrossRegCpyIdx && "Mapping out of bound");
return &ValMappings[ValMappingIdx];
}
+
+const RegisterBankInfo::ValueMapping *
+AArch64GenRegisterBankInfo::getFPExtMapping(unsigned DstSize,
+ unsigned SrcSize) {
+ // We support:
+ // - For Scalar:
+ // - 16 to 32.
+ // - 16 to 64.
+ // - 32 to 64.
+ // => FPR 16 to FPR 32|64
+ // => FPR 32 to FPR 64
+ // - For vectors:
+ // - v4f16 to v4f32
+ // - v2f32 to v2f64
+ // => FPR 64 to FPR 128
+
+ // Check that we have been asked sensible sizes.
+ if (SrcSize == 16) {
+ assert((DstSize == 32 || DstSize == 64) && "Unexpected half extension");
+ if (DstSize == 32)
+ return &ValMappings[FPExt16To32Idx];
+ return &ValMappings[FPExt16To64Idx];
+ }
+
+ if (SrcSize == 32) {
+ assert(DstSize == 64 && "Unexpected float extension");
+ return &ValMappings[FPExt32To64Idx];
+ }
+  assert((SrcSize == 64 && DstSize == 128) && "Unexpected vector extension");
+ return &ValMappings[FPExt64To128Idx];
+}
} // End llvm namespace.
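The cross-register-bank copy block starting at FirstCrossRegCpyIdx follows the same pattern with a stride of 2. A minimal standalone sketch, assuming the destination bank and its size offset select the entry (the bounds assert in getCopyMapping above only shows the valid range, so the exact formula is an assumption here); crossCopyIdx is an invented helper, not an LLVM API:

#include <cassert>

namespace {
enum { PMI_Min = 1, FPRBank = 1, GPRBank = 7 };            // PMI_FPR16, PMI_GPR32
enum { FirstCrossRegCpyIdx = 25, DistanceBetweenCrossRegCpy = 2 };

// FPR size offsets: 16/32/64 -> 0/1/2; GPR size offsets: 32/64 -> 0/1.
unsigned crossCopyIdx(unsigned DstBank, unsigned SizeOffset) {
  return FirstCrossRegCpyIdx +
         (DstBank - PMI_Min + SizeOffset) * DistanceBetweenCrossRegCpy;
}
} // namespace

int main() {
  assert(crossCopyIdx(FPRBank, 0) == 25); // new invalid FPR16 slot
  assert(crossCopyIdx(FPRBank, 1) == 27); // 32-bit FPR/GPR copy
  assert(crossCopyIdx(FPRBank, 2) == 29); // 64-bit FPR/GPR copy
  assert(crossCopyIdx(GPRBank, 0) == 37); // 32-bit GPR/FPR copy
  assert(crossCopyIdx(GPRBank, 1) == 39); // 64-bit GPR/FPR copy == LastCrossRegCpyIdx
}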
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index bec872ae8c0..81dac1be56c 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -42,6 +42,7 @@
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
@@ -71,7 +72,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetCallingConv.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
@@ -4981,7 +4981,7 @@ static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
// the initial estimate is 2^-8. Thus the number of extra steps to refine
// the result for float (23 mantissa bits) is 2 and for double (52
// mantissa bits) is 3.
- ExtraSteps = VT == MVT::f64 ? 3 : 2;
+ ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
}
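The 2 and 3 in this hunk come straight from the arithmetic in the comment: the initial estimate is accurate to about 8 bits and each Newton-Raphson iteration roughly doubles the number of correct bits. A minimal standalone sketch of that calculation; refineSteps is an invented helper, not an LLVM API:

#include <cassert>

// Smallest n with 8 * 2^n >= the mantissa precision we need to reach.
static int refineSteps(int PrecisionBits) {
  int Steps = 0;
  for (int Accurate = 8; Accurate < PrecisionBits; Accurate *= 2)
    ++Steps;
  return Steps;
}

int main() {
  assert(refineSteps(23) == 2); // float: 23 mantissa bits
  assert(refineSteps(52) == 3); // double: 52 mantissa bits
}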
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 24758e97888..2f10bef1e47 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -17,7 +17,7 @@
#include "AArch64.h"
#include "AArch64RegisterInfo.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "AArch64GenInstrInfo.inc"
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index eabbc05a033..e014d5bd569 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -335,6 +335,7 @@ let RecomputePerFunction = 1 in {
}
include "AArch64InstrFormats.td"
+include "SVEInstrFormats.td"
//===----------------------------------------------------------------------===//
@@ -6275,3 +6276,4 @@ def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
(TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
include "AArch64InstrAtomics.td"
+include "AArch64SVEInstrInfo.td"
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 2d45be37ca7..3a456255224 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -23,6 +23,110 @@
using namespace llvm;
+/// FIXME: The following static functions are SizeChangeStrategy functions
+/// that are meant to temporarily mimic the behaviour of the old legalization
+/// based on doubling/halving non-legal types as closely as possible. This is
+/// not entirely possible as only legalizing the types that are exactly a power
+/// of 2 times the size of the legal types would require specifying all those
+/// sizes explicitly.
+/// In practice, not specifying those isn't a problem, and the below functions
+/// should disappear quickly as we add support for legalizing non-power-of-2
+/// sized types further.
+static void
+addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
+ const LegalizerInfo::SizeAndActionsVec &v) {
+ for (unsigned i = 0; i < v.size(); ++i) {
+ result.push_back(v[i]);
+ if (i + 1 < v[i].first && i + 1 < v.size() &&
+ v[i + 1].first != v[i].first + 1)
+ result.push_back({v[i].first + 1, LegalizerInfo::Unsupported});
+ }
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1_narrow_128_ToLargest(const LegalizerInfo::SizeAndActionsVec &v) {
+ assert(v.size() >= 1);
+ assert(v[0].first > 2);
+ LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::WidenScalar},
+ {2, LegalizerInfo::Unsupported}};
+ addAndInterleaveWithUnsupported(result, v);
+ auto Largest = result.back().first;
+ assert(Largest + 1 < 128);
+ result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+ result.push_back({128, LegalizerInfo::NarrowScalar});
+ result.push_back({129, LegalizerInfo::Unsupported});
+ return result;
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_16(const LegalizerInfo::SizeAndActionsVec &v) {
+ assert(v.size() >= 1);
+ assert(v[0].first > 17);
+ LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::Unsupported},
+ {16, LegalizerInfo::WidenScalar},
+ {17, LegalizerInfo::Unsupported}};
+ addAndInterleaveWithUnsupported(result, v);
+ auto Largest = result.back().first;
+ result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+ return result;
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1_8(const LegalizerInfo::SizeAndActionsVec &v) {
+ assert(v.size() >= 1);
+ assert(v[0].first > 9);
+ LegalizerInfo::SizeAndActionsVec result = {
+ {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported},
+ {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported}};
+ addAndInterleaveWithUnsupported(result, v);
+ auto Largest = result.back().first;
+ result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+ return result;
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1_8_16(const LegalizerInfo::SizeAndActionsVec &v) {
+ assert(v.size() >= 1);
+ assert(v[0].first > 17);
+ LegalizerInfo::SizeAndActionsVec result = {
+ {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported},
+ {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported},
+ {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}};
+ addAndInterleaveWithUnsupported(result, v);
+ auto Largest = result.back().first;
+ result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+ return result;
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1_8_16_narrowToLargest(const LegalizerInfo::SizeAndActionsVec &v) {
+ assert(v.size() >= 1);
+ assert(v[0].first > 17);
+ LegalizerInfo::SizeAndActionsVec result = {
+ {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported},
+ {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported},
+ {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}};
+ addAndInterleaveWithUnsupported(result, v);
+ auto Largest = result.back().first;
+ result.push_back({Largest + 1, LegalizerInfo::NarrowScalar});
+ return result;
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1_8_16_32(const LegalizerInfo::SizeAndActionsVec &v) {
+ assert(v.size() >= 1);
+ assert(v[0].first > 33);
+ LegalizerInfo::SizeAndActionsVec result = {
+ {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported},
+ {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported},
+ {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported},
+ {32, LegalizerInfo::WidenScalar}, {33, LegalizerInfo::Unsupported}};
+ addAndInterleaveWithUnsupported(result, v);
+ auto Largest = result.back().first;
+ result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+ return result;
+}
+
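These widen_* helpers build step-function tables over scalar sizes. A minimal standalone sketch of how such a SizeAndActionsVec is meant to be read, assuming the consumer picks the entry with the largest size not exceeding the queried size; actionFor and the PhiTy0 table are invented for illustration, with PhiTy0 approximating what widen_1_8 plus the legal G_PHI sizes would produce:

#include <cassert>
#include <utility>
#include <vector>

enum Action { Legal, WidenScalar, NarrowScalar, Unsupported };
using SizeAndActionsVec = std::vector<std::pair<unsigned, Action>>;

// Entries are sorted by size; the last entry not exceeding Size applies.
static Action actionFor(const SizeAndActionsVec &V, unsigned Size) {
  Action A = Unsupported;
  for (const auto &Entry : V)
    if (Entry.first <= Size)
      A = Entry.second;
  return A;
}

int main() {
  SizeAndActionsVec PhiTy0 = {{1, WidenScalar}, {2, Unsupported},
                              {8, WidenScalar}, {9, Unsupported},
                              {16, Legal},      {17, Unsupported},
                              {32, Legal},      {33, Unsupported},
                              {64, Legal},      {65, Unsupported}};
  assert(actionFor(PhiTy0, 1) == WidenScalar);  // s1  -> widen
  assert(actionFor(PhiTy0, 8) == WidenScalar);  // s8  -> widen
  assert(actionFor(PhiTy0, 32) == Legal);       // s32 -> legal
  assert(actionFor(PhiTy0, 40) == Unsupported); // s40 -> not handled here
}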
AArch64LegalizerInfo::AArch64LegalizerInfo() {
using namespace TargetOpcode;
const LLT p0 = LLT::pointer(0, 64);
@@ -42,8 +146,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
for (auto Ty : {s16, s32, s64, p0})
setAction({G_PHI, Ty}, Legal);
- for (auto Ty : {s1, s8})
- setAction({G_PHI, Ty}, WidenScalar);
+ setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1_8);
for (auto Ty : { s32, s64 })
setAction({G_BSWAP, Ty}, Legal);
@@ -54,15 +157,15 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
for (auto Ty : {s32, s64, v2s32, v4s32, v2s64})
setAction({BinOp, Ty}, Legal);
- for (auto Ty : {s1, s8, s16})
- setAction({BinOp, Ty}, WidenScalar);
+ if (BinOp != G_ADD)
+ setLegalizeScalarToDifferentSizeStrategy(BinOp, 0,
+ widen_1_8_16_narrowToLargest);
}
setAction({G_GEP, p0}, Legal);
setAction({G_GEP, 1, s64}, Legal);
- for (auto Ty : {s1, s8, s16, s32})
- setAction({G_GEP, 1, Ty}, WidenScalar);
+ setLegalizeScalarToDifferentSizeStrategy(G_GEP, 1, widen_1_8_16_32);
setAction({G_PTR_MASK, p0}, Legal);
@@ -70,16 +173,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
for (auto Ty : {s32, s64})
setAction({BinOp, Ty}, Legal);
- for (auto Ty : {s1, s8, s16})
- setAction({BinOp, Ty}, WidenScalar);
+ setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1_8_16);
}
for (unsigned BinOp : {G_SREM, G_UREM})
for (auto Ty : { s1, s8, s16, s32, s64 })
setAction({BinOp, Ty}, Lower);
- for (unsigned Op : {G_SMULO, G_UMULO})
- setAction({Op, s64}, Lower);
+ for (unsigned Op : {G_SMULO, G_UMULO}) {
+ setAction({Op, 0, s64}, Lower);
+ setAction({Op, 1, s1}, Legal);
+ }
for (unsigned Op : {G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULH, G_UMULH}) {
for (auto Ty : { s32, s64 })
@@ -101,8 +205,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
setAction({G_INSERT, Ty}, Legal);
setAction({G_INSERT, 1, Ty}, Legal);
}
+ setLegalizeScalarToDifferentSizeStrategy(G_INSERT, 0,
+ widen_1_8_16_narrowToLargest);
for (auto Ty : {s1, s8, s16}) {
- setAction({G_INSERT, Ty}, WidenScalar);
setAction({G_INSERT, 1, Ty}, Legal);
// FIXME: Can't widen the sources because that violates the constraints on
// G_INSERT (It seems entirely reasonable that inputs shouldn't overlap).
@@ -118,7 +223,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
for (auto Ty : {s8, s16, s32, s64, p0, v2s32})
setAction({MemOp, Ty}, Legal);
- setAction({MemOp, s1}, WidenScalar);
+ setLegalizeScalarToDifferentSizeStrategy(MemOp, 0,
+ widen_1_narrow_128_ToLargest);
// And everything's fine in addrspace 0.
setAction({MemOp, 1, p0}, Legal);
@@ -132,20 +238,16 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
setAction({G_CONSTANT, p0}, Legal);
- for (auto Ty : {s1, s8, s16})
- setAction({TargetOpcode::G_CONSTANT, Ty}, WidenScalar);
-
- setAction({TargetOpcode::G_FCONSTANT, s16}, WidenScalar);
+ setLegalizeScalarToDifferentSizeStrategy(G_CONSTANT, 0, widen_1_8_16);
+ setLegalizeScalarToDifferentSizeStrategy(G_FCONSTANT, 0, widen_16);
setAction({G_ICMP, 1, s32}, Legal);
setAction({G_ICMP, 1, s64}, Legal);
setAction({G_ICMP, 1, p0}, Legal);
- for (auto Ty : {s1, s8, s16}) {
- setAction({G_ICMP, Ty}, WidenScalar);
- setAction({G_FCMP, Ty}, WidenScalar);
- setAction({G_ICMP, 1, Ty}, WidenScalar);
- }
+ setLegalizeScalarToDifferentSizeStrategy(G_ICMP, 0, widen_1_8_16);
+ setLegalizeScalarToDifferentSizeStrategy(G_FCMP, 0, widen_1_8_16);
+ setLegalizeScalarToDifferentSizeStrategy(G_ICMP, 1, widen_1_8_16);
setAction({G_ICMP, s32}, Legal);
setAction({G_FCMP, s32}, Legal);
@@ -159,12 +261,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
setAction({G_ANYEXT, Ty}, Legal);
}
- for (auto Ty : { s1, s8, s16, s32 }) {
- setAction({G_ZEXT, 1, Ty}, Legal);
- setAction({G_SEXT, 1, Ty}, Legal);
- setAction({G_ANYEXT, 1, Ty}, Legal);
- }
-
// FP conversions
for (auto Ty : { s16, s32 }) {
setAction({G_FPTRUNC, Ty}, Legal);
@@ -176,12 +272,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
setAction({G_FPEXT, Ty}, Legal);
}
- for (auto Ty : { s1, s8, s16, s32 })
- setAction({G_TRUNC, Ty}, Legal);
-
- for (auto Ty : { s8, s16, s32, s64 })
- setAction({G_TRUNC, 1, Ty}, Legal);
-
// Conversions
for (auto Ty : { s32, s64 }) {
setAction({G_FPTOSI, 0, Ty}, Legal);
@@ -189,12 +279,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
setAction({G_SITOFP, 1, Ty}, Legal);
setAction({G_UITOFP, 1, Ty}, Legal);
}
- for (auto Ty : { s1, s8, s16 }) {
- setAction({G_FPTOSI, 0, Ty}, WidenScalar);
- setAction({G_FPTOUI, 0, Ty}, WidenScalar);
- setAction({G_SITOFP, 1, Ty}, WidenScalar);
- setAction({G_UITOFP, 1, Ty}, WidenScalar);
- }
+ setLegalizeScalarToDifferentSizeStrategy(G_FPTOSI, 0, widen_1_8_16);
+ setLegalizeScalarToDifferentSizeStrategy(G_FPTOUI, 0, widen_1_8_16);
+ setLegalizeScalarToDifferentSizeStrategy(G_SITOFP, 1, widen_1_8_16);
+ setLegalizeScalarToDifferentSizeStrategy(G_UITOFP, 1, widen_1_8_16);
for (auto Ty : { s32, s64 }) {
setAction({G_FPTOSI, 1, Ty}, Legal);
@@ -209,8 +297,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
setAction({G_BRINDIRECT, p0}, Legal);
// Select
- for (auto Ty : {s1, s8, s16})
- setAction({G_SELECT, Ty}, WidenScalar);
+ setLegalizeScalarToDifferentSizeStrategy(G_SELECT, 0, widen_1_8_16);
for (auto Ty : {s32, s64, p0})
setAction({G_SELECT, Ty}, Legal);
diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp
index bd5211d4ff5..bd4bdaa6d12 100644
--- a/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -15,7 +15,7 @@
#include "AArch64MacroFusion.h"
#include "AArch64Subtarget.h"
#include "llvm/CodeGen/MacroFusion.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index 391e8ed633d..83bf493c9f0 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -87,9 +87,9 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR,
{PMI_GPR32, PMI_GPR64}) &&
"PartialMappingIdx's are incorrectly ordered");
- assert(checkPartialMappingIdx(
- PMI_FirstFPR, PMI_LastFPR,
- {PMI_FPR32, PMI_FPR64, PMI_FPR128, PMI_FPR256, PMI_FPR512}) &&
+ assert(checkPartialMappingIdx(PMI_FirstFPR, PMI_LastFPR,
+ {PMI_FPR16, PMI_FPR32, PMI_FPR64, PMI_FPR128,
+ PMI_FPR256, PMI_FPR512}) &&
"PartialMappingIdx's are incorrectly ordered");
// Now, the content.
// Check partial mapping.
@@ -102,6 +102,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR);
CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR);
+ CHECK_PARTIALMAP(PMI_FPR16, 0, 16, RBFPR);
CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR);
CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR);
CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR);
@@ -121,6 +122,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
CHECK_VALUEMAP(GPR, 32);
CHECK_VALUEMAP(GPR, 64);
+ CHECK_VALUEMAP(FPR, 16);
CHECK_VALUEMAP(FPR, 32);
CHECK_VALUEMAP(FPR, 64);
CHECK_VALUEMAP(FPR, 128);
@@ -173,6 +175,30 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 64);
CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 64);
+#define CHECK_VALUEMAP_FPEXT(DstSize, SrcSize) \
+ do { \
+ unsigned PartialMapDstIdx = PMI_FPR##DstSize - PMI_Min; \
+ unsigned PartialMapSrcIdx = PMI_FPR##SrcSize - PMI_Min; \
+ (void)PartialMapDstIdx; \
+ (void)PartialMapSrcIdx; \
+ const ValueMapping *Map = getFPExtMapping(DstSize, SrcSize); \
+ (void)Map; \
+ assert(Map[0].BreakDown == \
+ &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \
+ Map[0].NumBreakDowns == 1 && "FPR" #DstSize \
+ " Dst is incorrectly initialized"); \
+ assert(Map[1].BreakDown == \
+ &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \
+ Map[1].NumBreakDowns == 1 && "FPR" #SrcSize \
+ " Src is incorrectly initialized"); \
+ \
+ } while (false)
+
+ CHECK_VALUEMAP_FPEXT(32, 16);
+ CHECK_VALUEMAP_FPEXT(64, 16);
+ CHECK_VALUEMAP_FPEXT(64, 32);
+ CHECK_VALUEMAP_FPEXT(128, 64);
+
assert(verify(TRI) && "Invalid register bank information");
}
@@ -453,6 +479,14 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case TargetOpcode::G_FMUL:
case TargetOpcode::G_FDIV:
return getSameKindOfOperandsMapping(MI);
+ case TargetOpcode::G_FPEXT: {
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+ return getInstructionMapping(
+ DefaultMappingID, /*Cost*/ 1,
+ getFPExtMapping(DstTy.getSizeInBits(), SrcTy.getSizeInBits()),
+ /*NumOperands*/ 2);
+ }
case TargetOpcode::COPY: {
unsigned DstReg = MI.getOperand(0).getReg();
unsigned SrcReg = MI.getOperand(1).getReg();
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.h b/lib/Target/AArch64/AArch64RegisterBankInfo.h
index 6d74a47095a..008221dbef5 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.h
@@ -25,10 +25,10 @@ class TargetRegisterInfo;
class AArch64GenRegisterBankInfo : public RegisterBankInfo {
protected:
-
enum PartialMappingIdx {
PMI_None = -1,
- PMI_FPR32 = 1,
+ PMI_FPR16 = 1,
+ PMI_FPR32,
PMI_FPR64,
PMI_FPR128,
PMI_FPR256,
@@ -37,7 +37,7 @@ protected:
PMI_GPR64,
PMI_FirstGPR = PMI_GPR32,
PMI_LastGPR = PMI_GPR64,
- PMI_FirstFPR = PMI_FPR32,
+ PMI_FirstFPR = PMI_FPR16,
PMI_LastFPR = PMI_FPR512,
PMI_Min = PMI_FirstFPR,
};
@@ -49,11 +49,15 @@ protected:
enum ValueMappingIdx {
InvalidIdx = 0,
First3OpsIdx = 1,
- Last3OpsIdx = 19,
+ Last3OpsIdx = 22,
DistanceBetweenRegBanks = 3,
- FirstCrossRegCpyIdx = 22,
- LastCrossRegCpyIdx = 34,
- DistanceBetweenCrossRegCpy = 2
+ FirstCrossRegCpyIdx = 25,
+ LastCrossRegCpyIdx = 39,
+ DistanceBetweenCrossRegCpy = 2,
+ FPExt16To32Idx = 41,
+ FPExt16To64Idx = 43,
+ FPExt32To64Idx = 45,
+ FPExt64To128Idx = 47,
};
static bool checkPartialMap(unsigned Idx, unsigned ValStartIdx,
@@ -82,6 +86,15 @@ protected:
static const RegisterBankInfo::ValueMapping *
getCopyMapping(unsigned DstBankID, unsigned SrcBankID, unsigned Size);
+ /// Get the instruction mapping for G_FPEXT.
+ ///
+ /// \pre (DstSize, SrcSize) pair is one of the following:
+ /// (32, 16), (64, 16), (64, 32), (128, 64)
+ ///
+ /// \return An InstructionMapping with statically allocated OperandsMapping.
+ static const RegisterBankInfo::ValueMapping *
+ getFPExtMapping(unsigned DstSize, unsigned SrcSize);
+
#define GET_TARGET_REGBANK_CLASS
#include "AArch64GenRegisterBank.inc"
};
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 91b1481f5ef..1059bc37c8f 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -26,7 +26,7 @@
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index ee5d3547aaa..a9fb0200d80 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -32,6 +32,12 @@ let Namespace = "AArch64" in {
def qsub : SubRegIndex<64>;
def sube64 : SubRegIndex<64>;
def subo64 : SubRegIndex<64>;
+ // SVE
+ def zsub : SubRegIndex<128>;
+ // Note: zsub_hi should never be used directly because it represents
+ // the scalable part of the SVE vector and cannot be manipulated as a
+  // subvector in the same way the lower 128 bits can.
+ def zsub_hi : SubRegIndex<128>;
// Note: Code depends on these having consecutive numbers
def dsub0 : SubRegIndex<64>;
def dsub1 : SubRegIndex<64>;
@@ -460,11 +466,11 @@ def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> {
// assembler matching.
def VectorReg64AsmOperand : AsmOperandClass {
let Name = "VectorReg64";
- let PredicateMethod = "isVectorReg";
+ let PredicateMethod = "isNeonVectorReg";
}
def VectorReg128AsmOperand : AsmOperandClass {
let Name = "VectorReg128";
- let PredicateMethod = "isVectorReg";
+ let PredicateMethod = "isNeonVectorReg";
}
def V64 : RegisterOperand<FPR64, "printVRegOperand"> {
@@ -475,7 +481,10 @@ def V128 : RegisterOperand<FPR128, "printVRegOperand"> {
let ParserMatchClass = VectorReg128AsmOperand;
}
-def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; }
+def VectorRegLoAsmOperand : AsmOperandClass {
+ let Name = "VectorRegLo";
+ let PredicateMethod = "isNeonVectorRegLo";
+}
def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
let ParserMatchClass = VectorRegLoAsmOperand;
}
@@ -642,3 +651,119 @@ def XSeqPairClassOperand :
//===----- END: v8.1a atomic CASP register operands -----------------------===//
+
+// The parts of the SVE registers that don't overlap the Neon registers.
+// These are only used as part of clobber lists.
+def Z0_HI : AArch64Reg<0, "z0_hi">;
+def Z1_HI : AArch64Reg<1, "z1_hi">;
+def Z2_HI : AArch64Reg<2, "z2_hi">;
+def Z3_HI : AArch64Reg<3, "z3_hi">;
+def Z4_HI : AArch64Reg<4, "z4_hi">;
+def Z5_HI : AArch64Reg<5, "z5_hi">;
+def Z6_HI : AArch64Reg<6, "z6_hi">;
+def Z7_HI : AArch64Reg<7, "z7_hi">;
+def Z8_HI : AArch64Reg<8, "z8_hi">;
+def Z9_HI : AArch64Reg<9, "z9_hi">;
+def Z10_HI : AArch64Reg<10, "z10_hi">;
+def Z11_HI : AArch64Reg<11, "z11_hi">;
+def Z12_HI : AArch64Reg<12, "z12_hi">;
+def Z13_HI : AArch64Reg<13, "z13_hi">;
+def Z14_HI : AArch64Reg<14, "z14_hi">;
+def Z15_HI : AArch64Reg<15, "z15_hi">;
+def Z16_HI : AArch64Reg<16, "z16_hi">;
+def Z17_HI : AArch64Reg<17, "z17_hi">;
+def Z18_HI : AArch64Reg<18, "z18_hi">;
+def Z19_HI : AArch64Reg<19, "z19_hi">;
+def Z20_HI : AArch64Reg<20, "z20_hi">;
+def Z21_HI : AArch64Reg<21, "z21_hi">;
+def Z22_HI : AArch64Reg<22, "z22_hi">;
+def Z23_HI : AArch64Reg<23, "z23_hi">;
+def Z24_HI : AArch64Reg<24, "z24_hi">;
+def Z25_HI : AArch64Reg<25, "z25_hi">;
+def Z26_HI : AArch64Reg<26, "z26_hi">;
+def Z27_HI : AArch64Reg<27, "z27_hi">;
+def Z28_HI : AArch64Reg<28, "z28_hi">;
+def Z29_HI : AArch64Reg<29, "z29_hi">;
+def Z30_HI : AArch64Reg<30, "z30_hi">;
+def Z31_HI : AArch64Reg<31, "z31_hi">;
+
+// SVE variable-size vector registers
+let SubRegIndices = [zsub,zsub_hi] in {
+def Z0 : AArch64Reg<0, "z0", [Q0, Z0_HI]>, DwarfRegNum<[96]>;
+def Z1 : AArch64Reg<1, "z1", [Q1, Z1_HI]>, DwarfRegNum<[97]>;
+def Z2 : AArch64Reg<2, "z2", [Q2, Z2_HI]>, DwarfRegNum<[98]>;
+def Z3 : AArch64Reg<3, "z3", [Q3, Z3_HI]>, DwarfRegNum<[99]>;
+def Z4 : AArch64Reg<4, "z4", [Q4, Z4_HI]>, DwarfRegNum<[100]>;
+def Z5 : AArch64Reg<5, "z5", [Q5, Z5_HI]>, DwarfRegNum<[101]>;
+def Z6 : AArch64Reg<6, "z6", [Q6, Z6_HI]>, DwarfRegNum<[102]>;
+def Z7 : AArch64Reg<7, "z7", [Q7, Z7_HI]>, DwarfRegNum<[103]>;
+def Z8 : AArch64Reg<8, "z8", [Q8, Z8_HI]>, DwarfRegNum<[104]>;
+def Z9 : AArch64Reg<9, "z9", [Q9, Z9_HI]>, DwarfRegNum<[105]>;
+def Z10 : AArch64Reg<10, "z10", [Q10, Z10_HI]>, DwarfRegNum<[106]>;
+def Z11 : AArch64Reg<11, "z11", [Q11, Z11_HI]>, DwarfRegNum<[107]>;
+def Z12 : AArch64Reg<12, "z12", [Q12, Z12_HI]>, DwarfRegNum<[108]>;
+def Z13 : AArch64Reg<13, "z13", [Q13, Z13_HI]>, DwarfRegNum<[109]>;
+def Z14 : AArch64Reg<14, "z14", [Q14, Z14_HI]>, DwarfRegNum<[110]>;
+def Z15 : AArch64Reg<15, "z15", [Q15, Z15_HI]>, DwarfRegNum<[111]>;
+def Z16 : AArch64Reg<16, "z16", [Q16, Z16_HI]>, DwarfRegNum<[112]>;
+def Z17 : AArch64Reg<17, "z17", [Q17, Z17_HI]>, DwarfRegNum<[113]>;
+def Z18 : AArch64Reg<18, "z18", [Q18, Z18_HI]>, DwarfRegNum<[114]>;
+def Z19 : AArch64Reg<19, "z19", [Q19, Z19_HI]>, DwarfRegNum<[115]>;
+def Z20 : AArch64Reg<20, "z20", [Q20, Z20_HI]>, DwarfRegNum<[116]>;
+def Z21 : AArch64Reg<21, "z21", [Q21, Z21_HI]>, DwarfRegNum<[117]>;
+def Z22 : AArch64Reg<22, "z22", [Q22, Z22_HI]>, DwarfRegNum<[118]>;
+def Z23 : AArch64Reg<23, "z23", [Q23, Z23_HI]>, DwarfRegNum<[119]>;
+def Z24 : AArch64Reg<24, "z24", [Q24, Z24_HI]>, DwarfRegNum<[120]>;
+def Z25 : AArch64Reg<25, "z25", [Q25, Z25_HI]>, DwarfRegNum<[121]>;
+def Z26 : AArch64Reg<26, "z26", [Q26, Z26_HI]>, DwarfRegNum<[122]>;
+def Z27 : AArch64Reg<27, "z27", [Q27, Z27_HI]>, DwarfRegNum<[123]>;
+def Z28 : AArch64Reg<28, "z28", [Q28, Z28_HI]>, DwarfRegNum<[124]>;
+def Z29 : AArch64Reg<29, "z29", [Q29, Z29_HI]>, DwarfRegNum<[125]>;
+def Z30 : AArch64Reg<30, "z30", [Q30, Z30_HI]>, DwarfRegNum<[126]>;
+def Z31 : AArch64Reg<31, "z31", [Q31, Z31_HI]>, DwarfRegNum<[127]>;
+}
+
+class SVERegOp <string Suffix, AsmOperandClass C,
+ RegisterClass RC> : RegisterOperand<RC> {
+ let PrintMethod = !if(!eq(Suffix, ""),
+ "printSVERegOp<>",
+ "printSVERegOp<'" # Suffix # "'>");
+ let ParserMatchClass = C;
+}
+
+class ZPRRegOp <string Suffix, AsmOperandClass C,
+ RegisterClass RC> : SVERegOp<Suffix, C, RC> {}
+
+//******************************************************************************
+
+// SVE vector register class
+def ZPR : RegisterClass<"AArch64",
+ [nxv16i8, nxv8i16, nxv4i32, nxv2i64,
+ nxv2f16, nxv4f16, nxv8f16,
+ nxv1f32, nxv2f32, nxv4f32,
+ nxv1f64, nxv2f64],
+ 128, (sequence "Z%u", 0, 31)> {
+ let Size = 128;
+}
+
+class ZPRAsmOperand <string name, int Width>: AsmOperandClass {
+ let Name = "SVE" # name # "Reg";
+ let PredicateMethod = "isSVEDataVectorRegOfWidth<" # Width # ">";
+ let RenderMethod = "addRegOperands";
+ let ParserMethod = "tryParseSVEDataVector<"
+ # !if(!eq(Width, -1), "false", "true") # ">";
+}
+
+def ZPRAsmOpAny : ZPRAsmOperand<"VectorAny", -1>;
+def ZPRAsmOp8 : ZPRAsmOperand<"VectorB", 8>;
+def ZPRAsmOp16 : ZPRAsmOperand<"VectorH", 16>;
+def ZPRAsmOp32 : ZPRAsmOperand<"VectorS", 32>;
+def ZPRAsmOp64 : ZPRAsmOperand<"VectorD", 64>;
+def ZPRAsmOp128 : ZPRAsmOperand<"VectorQ", 128>;
+
+def ZPRAny : ZPRRegOp<"", ZPRAsmOpAny, ZPR>;
+def ZPR8 : ZPRRegOp<"b", ZPRAsmOp8, ZPR>;
+def ZPR16 : ZPRRegOp<"h", ZPRAsmOp16, ZPR>;
+def ZPR32 : ZPRRegOp<"s", ZPRAsmOp32, ZPR>;
+def ZPR64 : ZPRRegOp<"d", ZPRAsmOp64, ZPR>;
+def ZPR128 : ZPRRegOp<"q", ZPRAsmOp128, ZPR>;
diff --git a/lib/Target/AArch64/AArch64SVEInstrInfo.td b/lib/Target/AArch64/AArch64SVEInstrInfo.td
new file mode 100644
index 00000000000..7da0b28d22d
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -0,0 +1,17 @@
+//=- AArch64SVEInstrInfo.td - AArch64 SVE Instructions -*- tablegen -*-----=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AArch64 Scalable Vector Extension (SVE) Instruction definitions.
+//
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSVE] in {
+ defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add">;
+ defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub">;
+}
diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td
index 18d000ace94..90ebd78f4ab 100644
--- a/lib/Target/AArch64/AArch64SchedA53.td
+++ b/lib/Target/AArch64/AArch64SchedA53.td
@@ -26,6 +26,8 @@ def CortexA53Model : SchedMachineModel {
// Specification - Instruction Timings"
// v 1.0 Spreadsheet
let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = [HasSVE];
}
diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td
index 5d1608ef04a..ade03f23f8c 100644
--- a/lib/Target/AArch64/AArch64SchedA57.td
+++ b/lib/Target/AArch64/AArch64SchedA57.td
@@ -31,6 +31,8 @@ def CortexA57Model : SchedMachineModel {
// experiments and benchmarking data.
let LoopMicroOpBufferSize = 16;
let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = [HasSVE];
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SchedCyclone.td b/lib/Target/AArch64/AArch64SchedCyclone.td
index 9fd3ae6818e..7a474ba8ef9 100644
--- a/lib/Target/AArch64/AArch64SchedCyclone.td
+++ b/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -18,6 +18,8 @@ def CycloneModel : SchedMachineModel {
let LoadLatency = 4; // Optimistic load latency.
let MispredictPenalty = 16; // 14-19 cycles are typical.
let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = [HasSVE];
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SchedFalkor.td b/lib/Target/AArch64/AArch64SchedFalkor.td
index 44fd94fc3d4..7277198b585 100644
--- a/lib/Target/AArch64/AArch64SchedFalkor.td
+++ b/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -23,6 +23,8 @@ def FalkorModel : SchedMachineModel {
let LoadLatency = 3; // Optimistic load latency.
let MispredictPenalty = 11; // Minimum branch misprediction penalty.
let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = [HasSVE];
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SchedKryo.td b/lib/Target/AArch64/AArch64SchedKryo.td
index 4e491a04c78..ce2afd499af 100644
--- a/lib/Target/AArch64/AArch64SchedKryo.td
+++ b/lib/Target/AArch64/AArch64SchedKryo.td
@@ -27,6 +27,8 @@ def KryoModel : SchedMachineModel {
// experiments and benchmarking data.
let LoopMicroOpBufferSize = 16;
let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = [HasSVE];
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedM1.td
index 6133efed020..6c86fcdd29b 100644
--- a/lib/Target/AArch64/AArch64SchedM1.td
+++ b/lib/Target/AArch64/AArch64SchedM1.td
@@ -24,6 +24,8 @@ def ExynosM1Model : SchedMachineModel {
let LoadLatency = 4; // Optimistic load cases.
let MispredictPenalty = 14; // Minimum branch misprediction penalty.
let CompleteModel = 1; // Use the default model otherwise.
+
+ list<Predicate> UnsupportedFeatures = [HasSVE];
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SchedThunderX.td b/lib/Target/AArch64/AArch64SchedThunderX.td
index 3cdd2047fbb..585688aae27 100644
--- a/lib/Target/AArch64/AArch64SchedThunderX.td
+++ b/lib/Target/AArch64/AArch64SchedThunderX.td
@@ -25,6 +25,8 @@ def ThunderXT8XModel : SchedMachineModel {
let MispredictPenalty = 8; // Branch mispredict penalty.
let PostRAScheduler = 1; // Use PostRA scheduler.
let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = [HasSVE];
}
// Modeling each pipeline with BufferSize == 0 since T8X is in-order.
diff --git a/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/lib/Target/AArch64/AArch64SchedThunderX2T99.td
index 4ab7555594a..fd60459382a 100644
--- a/lib/Target/AArch64/AArch64SchedThunderX2T99.td
+++ b/lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -25,6 +25,8 @@ def ThunderX2T99Model : SchedMachineModel {
let LoopMicroOpBufferSize = 32;
let PostRAScheduler = 1; // Using PostRA sched.
let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = [HasSVE];
}
// Define the issue ports.
diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index fe984ccbaf1..78fc322158b 100644
--- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -16,10 +16,10 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/Target/AArch64/AArch64VectorByElementOpt.cpp b/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
index f53af2315ec..2ff644d2bcd 100644
--- a/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
+++ b/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
@@ -33,11 +33,11 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <map>
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 1f06d4065b3..de048a24534 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -59,12 +59,14 @@ using namespace llvm;
namespace {
+enum class RegKind {Scalar, NeonVector, SVEDataVector};
+
class AArch64AsmParser : public MCTargetAsmParser {
private:
StringRef Mnemonic; ///< Instruction mnemonic.
// Map of register aliases registers via the .req directive.
- StringMap<std::pair<bool, unsigned>> RegisterReqs;
+ StringMap<std::pair<RegKind, unsigned>> RegisterReqs;
AArch64TargetStreamer &getTargetStreamer() {
MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
@@ -77,9 +79,10 @@ private:
void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S);
AArch64CC::CondCode parseCondCodeString(StringRef Cond);
bool parseCondCode(OperandVector &Operands, bool invertCondCode);
- unsigned matchRegisterNameAlias(StringRef Name, bool isVector);
+ unsigned matchRegisterNameAlias(StringRef Name, RegKind Kind);
int tryParseRegister();
int tryMatchVectorRegister(StringRef &Kind, bool expected);
+ int tryParseSVEDataVectorRegister(const AsmToken &Tok, StringRef &Kind);
bool parseRegister(OperandVector &Operands);
bool parseSymbolicImmVal(const MCExpr *&ImmVal);
bool parseVectorList(OperandVector &Operands);
@@ -126,8 +129,10 @@ private:
OperandMatchResultTy tryParseFPImm(OperandVector &Operands);
OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands);
OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands);
- bool tryParseVectorRegister(OperandVector &Operands);
+ bool tryParseNeonVectorRegister(OperandVector &Operands);
OperandMatchResultTy tryParseGPRSeqPair(OperandVector &Operands);
+ template <bool ParseSuffix>
+ OperandMatchResultTy tryParseSVEDataVector(OperandVector &Operands);
public:
enum AArch64MatchResultTy {
@@ -194,7 +199,9 @@ private:
struct RegOp {
unsigned RegNum;
- bool isVector;
+ RegKind Kind;
+
+ int ElementWidth;
};
struct VectorListOp {
@@ -804,34 +811,50 @@ public:
return SysReg.PStateField != -1U;
}
- bool isReg() const override { return Kind == k_Register && !Reg.isVector; }
- bool isVectorReg() const { return Kind == k_Register && Reg.isVector; }
+ bool isReg() const override {
+ return Kind == k_Register && Reg.Kind == RegKind::Scalar;
+ }
+
+ bool isNeonVectorReg() const {
+ return Kind == k_Register && Reg.Kind == RegKind::NeonVector;
+ }
- bool isVectorRegLo() const {
- return Kind == k_Register && Reg.isVector &&
+ bool isNeonVectorRegLo() const {
+ return Kind == k_Register && Reg.Kind == RegKind::NeonVector &&
AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
Reg.RegNum);
}
+ template <unsigned Class = AArch64::ZPRRegClassID>
+ bool isSVEDataVectorReg() const {
+ return (Kind == k_Register && Reg.Kind == RegKind::SVEDataVector) &&
+ AArch64MCRegisterClasses[Class].contains(getReg());
+ }
+
+ template <int ElementWidth> bool isSVEDataVectorRegOfWidth() const {
+ return isSVEDataVectorReg() &&
+ (ElementWidth == -1 || Reg.ElementWidth == ElementWidth);
+ }
+
bool isGPR32as64() const {
- return Kind == k_Register && !Reg.isVector &&
+ return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum);
}
bool isWSeqPair() const {
- return Kind == k_Register && !Reg.isVector &&
+ return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains(
Reg.RegNum);
}
bool isXSeqPair() const {
- return Kind == k_Register && !Reg.isVector &&
+ return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID].contains(
Reg.RegNum);
}
bool isGPR64sp0() const {
- return Kind == k_Register && !Reg.isVector &&
+ return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].contains(Reg.RegNum);
}
@@ -1564,10 +1587,22 @@ public:
}
static std::unique_ptr<AArch64Operand>
- CreateReg(unsigned RegNum, bool isVector, SMLoc S, SMLoc E, MCContext &Ctx) {
+ CreateReg(unsigned RegNum, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx) {
auto Op = make_unique<AArch64Operand>(k_Register, Ctx);
Op->Reg.RegNum = RegNum;
- Op->Reg.isVector = isVector;
+ Op->Reg.Kind = Kind;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static std::unique_ptr<AArch64Operand>
+ CreateReg(unsigned RegNum, RegKind Kind, unsigned ElementWidth,
+ SMLoc S, SMLoc E, MCContext &Ctx) {
+ auto Op = make_unique<AArch64Operand>(k_Register, Ctx);
+ Op->Reg.RegNum = RegNum;
+ Op->Reg.ElementWidth = ElementWidth;
+ Op->Reg.Kind = Kind;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
@@ -1791,7 +1826,7 @@ static unsigned MatchRegisterName(StringRef Name);
/// }
-static unsigned matchVectorRegName(StringRef Name) {
+static unsigned MatchNeonVectorRegName(StringRef Name) {
return StringSwitch<unsigned>(Name.lower())
.Case("v0", AArch64::Q0)
.Case("v1", AArch64::Q1)
@@ -1853,6 +1888,57 @@ static bool isValidVectorKind(StringRef Name) {
.Default(false);
}
+static unsigned matchSVEDataVectorRegName(StringRef Name) {
+ return StringSwitch<unsigned>(Name.lower())
+ .Case("z0", AArch64::Z0)
+ .Case("z1", AArch64::Z1)
+ .Case("z2", AArch64::Z2)
+ .Case("z3", AArch64::Z3)
+ .Case("z4", AArch64::Z4)
+ .Case("z5", AArch64::Z5)
+ .Case("z6", AArch64::Z6)
+ .Case("z7", AArch64::Z7)
+ .Case("z8", AArch64::Z8)
+ .Case("z9", AArch64::Z9)
+ .Case("z10", AArch64::Z10)
+ .Case("z11", AArch64::Z11)
+ .Case("z12", AArch64::Z12)
+ .Case("z13", AArch64::Z13)
+ .Case("z14", AArch64::Z14)
+ .Case("z15", AArch64::Z15)
+ .Case("z16", AArch64::Z16)
+ .Case("z17", AArch64::Z17)
+ .Case("z18", AArch64::Z18)
+ .Case("z19", AArch64::Z19)
+ .Case("z20", AArch64::Z20)
+ .Case("z21", AArch64::Z21)
+ .Case("z22", AArch64::Z22)
+ .Case("z23", AArch64::Z23)
+ .Case("z24", AArch64::Z24)
+ .Case("z25", AArch64::Z25)
+ .Case("z26", AArch64::Z26)
+ .Case("z27", AArch64::Z27)
+ .Case("z28", AArch64::Z28)
+ .Case("z29", AArch64::Z29)
+ .Case("z30", AArch64::Z30)
+ .Case("z31", AArch64::Z31)
+ .Default(0);
+}
+
+static bool isValidSVEKind(StringRef Name) {
+ return StringSwitch<bool>(Name.lower())
+ .Case(".b", true)
+ .Case(".h", true)
+ .Case(".s", true)
+ .Case(".d", true)
+ .Case(".q", true)
+ .Default(false);
+}
+
+static bool isSVEDataVectorRegister(StringRef Name) {
+ return Name[0] == 'z';
+}
+
static void parseValidVectorKind(StringRef Name, unsigned &NumElements,
char &ElementKind) {
assert(isValidVectorKind(Name));
@@ -1881,19 +1967,30 @@ bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
// Matches a register name or register alias previously defined by '.req'
unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
- bool isVector) {
- unsigned RegNum = isVector ? matchVectorRegName(Name)
- : MatchRegisterName(Name);
+ RegKind Kind) {
+ unsigned RegNum;
+ switch (Kind) {
+ case RegKind::Scalar:
+ RegNum = MatchRegisterName(Name);
+ break;
+ case RegKind::NeonVector:
+ RegNum = MatchNeonVectorRegName(Name);
+ break;
+ case RegKind::SVEDataVector:
+ RegNum = matchSVEDataVectorRegName(Name);
+ break;
+ }
- if (RegNum == 0) {
+ if (!RegNum) {
// Check for aliases registered via .req. Canonicalize to lower case.
// That's more consistent since register names are case insensitive, and
// it's how the original entry was passed in from MC/MCParser/AsmParser.
auto Entry = RegisterReqs.find(Name.lower());
if (Entry == RegisterReqs.end())
return 0;
+
// set RegNum if the match is the right kind of register
- if (isVector == Entry->getValue().first)
+ if (Kind == Entry->getValue().first)
RegNum = Entry->getValue().second;
}
return RegNum;
@@ -1909,7 +2006,10 @@ int AArch64AsmParser::tryParseRegister() {
return -1;
std::string lowerCase = Tok.getString().lower();
- unsigned RegNum = matchRegisterNameAlias(lowerCase, false);
+ if (isSVEDataVectorRegister(lowerCase))
+ return -1;
+
+ unsigned RegNum = matchRegisterNameAlias(lowerCase, RegKind::Scalar);
// Also handle a few aliases of registers.
if (RegNum == 0)
RegNum = StringSwitch<unsigned>(lowerCase)
@@ -1940,7 +2040,7 @@ int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) {
// a '.'.
size_t Start = 0, Next = Name.find('.');
StringRef Head = Name.slice(Start, Next);
- unsigned RegNum = matchRegisterNameAlias(Head, true);
+ unsigned RegNum = matchRegisterNameAlias(Head, RegKind::NeonVector);
if (RegNum) {
if (Next != StringRef::npos) {
@@ -2559,8 +2659,8 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
return MatchOperand_Success;
}
-/// tryParseVectorRegister - Parse a vector register operand.
-bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
+/// tryParseNeonVectorRegister - Parse a vector register operand.
+bool AArch64AsmParser::tryParseNeonVectorRegister(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
if (Parser.getTok().isNot(AsmToken::Identifier))
return true;
@@ -2572,7 +2672,9 @@ bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
if (Reg == -1)
return true;
Operands.push_back(
- AArch64Operand::CreateReg(Reg, true, S, getLoc(), getContext()));
+ AArch64Operand::CreateReg(Reg, RegKind::NeonVector, S, getLoc(),
+ getContext()));
+
// If there was an explicit qualifier, that goes on as a literal text
// operand.
if (!Kind.empty())
@@ -2603,19 +2705,48 @@ bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
return false;
}
+// tryParseSVEDataVectorRegister - Try to parse an SVE vector register name with
+// optional kind specifier. If it is a register specifier, eat the token
+// and return it.
+int AArch64AsmParser::tryParseSVEDataVectorRegister(const AsmToken &Tok,
+ StringRef &Kind) {
+ if (Tok.isNot(AsmToken::Identifier))
+ return -1;
+
+ StringRef Name = Tok.getString();
+ // If there is a kind specifier, it's separated from the register name by
+ // a '.'.
+ size_t Start = 0, Next = Name.find('.');
+ StringRef Head = Name.slice(Start, Next);
+ unsigned RegNum = matchRegisterNameAlias(Head, RegKind::SVEDataVector);
+
+ if (RegNum) {
+ if (Next != StringRef::npos) {
+ Kind = Name.slice(Next, StringRef::npos);
+ if (!isValidSVEKind(Kind)) {
+ TokError("invalid sve vector kind qualifier");
+ return -1;
+ }
+ }
+ return RegNum;
+ }
+
+ return -1;
+}
+
/// parseRegister - Parse a non-vector register operand.
bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
SMLoc S = getLoc();
- // Try for a vector register.
- if (!tryParseVectorRegister(Operands))
+ // Try for a vector (neon) register.
+ if (!tryParseNeonVectorRegister(Operands))
return false;
// Try for a scalar register.
int64_t Reg = tryParseRegister();
if (Reg == -1)
return true;
- Operands.push_back(
- AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext()));
+ Operands.push_back(AArch64Operand::CreateReg(Reg, RegKind::Scalar, S,
+ getLoc(), getContext()));
return false;
}
@@ -2783,7 +2914,7 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
if (!Tok.is(AsmToken::Identifier))
return MatchOperand_NoMatch;
- unsigned RegNum = matchRegisterNameAlias(Tok.getString().lower(), false);
+ unsigned RegNum = matchRegisterNameAlias(Tok.getString().lower(), RegKind::Scalar);
MCContext &Ctx = getContext();
const MCRegisterInfo *RI = Ctx.getRegisterInfo();
@@ -2795,7 +2926,7 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
if (!parseOptionalToken(AsmToken::Comma)) {
Operands.push_back(
- AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx));
+ AArch64Operand::CreateReg(RegNum, RegKind::Scalar, S, getLoc(), Ctx));
return MatchOperand_Success;
}
@@ -2814,7 +2945,7 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
}
Operands.push_back(
- AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx));
+ AArch64Operand::CreateReg(RegNum, RegKind::Scalar, S, getLoc(), Ctx));
return MatchOperand_Success;
}
@@ -3529,8 +3660,8 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
Operands[0] = AArch64Operand::CreateToken(
"bfm", false, Op.getStartLoc(), getContext());
Operands[2] = AArch64Operand::CreateReg(
- RegWidth == 32 ? AArch64::WZR : AArch64::XZR, false, SMLoc(),
- SMLoc(), getContext());
+ RegWidth == 32 ? AArch64::WZR : AArch64::XZR, RegKind::Scalar,
+ SMLoc(), SMLoc(), getContext());
Operands[3] = AArch64Operand::CreateImm(
ImmRExpr, LSBOp.getStartLoc(), LSBOp.getEndLoc(), getContext());
Operands.emplace_back(
@@ -3666,8 +3797,9 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]);
if (Op.isReg()) {
unsigned Reg = getXRegFromWReg(Op.getReg());
- Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
- Op.getEndLoc(), getContext());
+ Operands[2] = AArch64Operand::CreateReg(Reg, RegKind::Scalar,
+ Op.getStartLoc(), Op.getEndLoc(),
+ getContext());
}
}
// FIXME: Likewise for sxt[bh] with a Xd dst operand
@@ -3681,7 +3813,8 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]);
if (Op.isReg()) {
unsigned Reg = getXRegFromWReg(Op.getReg());
- Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
+ Operands[2] = AArch64Operand::CreateReg(Reg, RegKind::Scalar,
+ Op.getStartLoc(),
Op.getEndLoc(), getContext());
}
}
@@ -3697,7 +3830,8 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
if (Op.isReg()) {
unsigned Reg = getWRegFromXReg(Op.getReg());
- Operands[1] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
+ Operands[1] = AArch64Operand::CreateReg(Reg, RegKind::Scalar,
+ Op.getStartLoc(),
Op.getEndLoc(), getContext());
}
}
@@ -4158,14 +4292,25 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
Parser.Lex(); // Eat the '.req' token.
SMLoc SRegLoc = getLoc();
unsigned RegNum = tryParseRegister();
- bool IsVector = false;
+ RegKind RegisterKind = RegKind::Scalar;
if (RegNum == static_cast<unsigned>(-1)) {
StringRef Kind;
+ RegisterKind = RegKind::NeonVector;
RegNum = tryMatchVectorRegister(Kind, false);
if (!Kind.empty())
return Error(SRegLoc, "vector register without type specifier expected");
- IsVector = true;
+ }
+
+ if (RegNum == static_cast<unsigned>(-1)) {
+ StringRef Kind;
+ RegisterKind = RegKind::SVEDataVector;
+ int RegNumTmp = tryParseSVEDataVectorRegister(Parser.getTok(), Kind);
+ if (RegNumTmp != -1)
+ Parser.Lex();
+ RegNum = RegNumTmp;
+ if (!Kind.empty())
+ return Error(SRegLoc, "sve vector register without type specifier expected");
}
if (RegNum == static_cast<unsigned>(-1))
@@ -4176,7 +4321,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
"unexpected input in .req directive"))
return true;
- auto pair = std::make_pair(IsVector, RegNum);
+ auto pair = std::make_pair(RegisterKind, RegNum);
if (RegisterReqs.insert(std::make_pair(Name, pair)).first->second != pair)
Warning(L, "ignoring redefinition of register alias '" + Name + "'");
@@ -4388,8 +4533,43 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) {
&AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID]);
}
- Operands.push_back(AArch64Operand::CreateReg(Pair, false, S, getLoc(),
- getContext()));
+ Operands.push_back(AArch64Operand::CreateReg(Pair, RegKind::Scalar, S,
+ getLoc(), getContext()));
+
+ return MatchOperand_Success;
+}
+
+template <bool ParseSuffix>
+OperandMatchResultTy
+AArch64AsmParser::tryParseSVEDataVector(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ const SMLoc S = getLoc();
+  // Check for an SVE vector register specifier first.
+ StringRef Kind;
+ int RegNum = tryParseSVEDataVectorRegister(Parser.getTok(), Kind);
+ if (RegNum == -1)
+ return MatchOperand_NoMatch;
+
+ // Eat the SVE Register Token
+ Parser.Lex();
+
+ if (ParseSuffix && Kind.empty())
+ return MatchOperand_NoMatch;
+
+ unsigned ElementWidth = StringSwitch<unsigned>(Kind.lower())
+ .Case("", -1)
+ .Case(".b", 8)
+ .Case(".h", 16)
+ .Case(".s", 32)
+ .Case(".d", 64)
+ .Case(".q", 128)
+ .Default(0);
+ if (!ElementWidth)
+ return MatchOperand_NoMatch;
+
+ Operands.push_back(
+ AArch64Operand::CreateReg(RegNum, RegKind::SVEDataVector, ElementWidth,
+ S, S, getContext()));
return MatchOperand_Success;
}
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 73db32c0487..aea1b4f2d2c 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -85,6 +85,9 @@ static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decode);
static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm,
uint64_t Address,
@@ -436,6 +439,27 @@ static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo,
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
+static const unsigned ZPRDecoderTable[] = {
+ AArch64::Z0, AArch64::Z1, AArch64::Z2, AArch64::Z3,
+ AArch64::Z4, AArch64::Z5, AArch64::Z6, AArch64::Z7,
+ AArch64::Z8, AArch64::Z9, AArch64::Z10, AArch64::Z11,
+ AArch64::Z12, AArch64::Z13, AArch64::Z14, AArch64::Z15,
+ AArch64::Z16, AArch64::Z17, AArch64::Z18, AArch64::Z19,
+ AArch64::Z20, AArch64::Z21, AArch64::Z22, AArch64::Z23,
+ AArch64::Z24, AArch64::Z25, AArch64::Z26, AArch64::Z27,
+ AArch64::Z28, AArch64::Z29, AArch64::Z30, AArch64::Z31
+};
+
+static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void* Decoder) {
+ if (RegNo > 31)
+ return Fail;
+
+ unsigned Register = ZPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return Success;
+}
static const unsigned VectorDecoderTable[] = {
AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4,
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index 62e5d02f603..bdf71b095fd 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -1340,3 +1340,23 @@ void AArch64InstPrinter::printComplexRotationOp(const MCInst *MI, unsigned OpNo,
O << "#" << (Val * Angle) + Remainder;
}
+template <char suffix>
+void AArch64InstPrinter::printSVERegOp(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ switch (suffix) {
+ case 0:
+ case 'b':
+ case 'h':
+ case 's':
+ case 'd':
+ case 'q':
+ break;
+ default: llvm_unreachable("Invalid kind specifier.");
+ }
+
+ unsigned Reg = MI->getOperand(OpNum).getReg();
+ O << getRegisterName(Reg);
+ if (suffix != 0)
+ O << '.' << suffix;
+}
\ No newline at end of file
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index 8515ad24c71..76f20f042ce 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -165,6 +165,9 @@ protected:
void printGPRSeqPairsClassOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O);
+ template <char = 0>
+ void printSVERegOp(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
};
class AArch64AppleInstPrinter : public AArch64InstPrinter {
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 7fba4849438..c5da457c38f 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -106,13 +106,15 @@ AArch64MCAsmInfoCOFF::AArch64MCAsmInfoCOFF() {
PrivateLabelPrefix = ".L";
AlignmentIsInBytes = false;
SupportsDebugInformation = true;
- ExceptionsType = ExceptionHandling::WinEH;
+ CodePointerSize = 8;
}
AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
CommentString = ";";
+ ExceptionsType = ExceptionHandling::WinEH;
}
AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
CommentString = "//";
+ ExceptionsType = ExceptionHandling::DwarfCFI;
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
index 9d0f39e5f6a..c88363d2c25 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -23,7 +23,15 @@ public:
std::unique_ptr<MCCodeEmitter> CE,
raw_pwrite_stream &OS)
: MCWinCOFFStreamer(C, std::move(AB), std::move(CE), OS) {}
+
+ void FinishImpl() override;
};
+
+void AArch64WinCOFFStreamer::FinishImpl() {
+ EmitFrames(nullptr);
+
+ MCWinCOFFStreamer::FinishImpl();
+}
} // end anonymous namespace
namespace llvm {
diff --git a/lib/Target/AArch64/SVEInstrFormats.td b/lib/Target/AArch64/SVEInstrFormats.td
new file mode 100644
index 00000000000..e74bab8b7fe
--- /dev/null
+++ b/lib/Target/AArch64/SVEInstrFormats.td
@@ -0,0 +1,41 @@
+//=-- SVEInstrFormats.td - AArch64 SVE Instruction classes -*- tablegen -*--=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AArch64 Scalable Vector Extension (SVE) Instruction Class Definitions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Arithmetic - Unpredicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_bin_cons_arit_0<bits<2> sz8_64, bits<3> opc, string asm,
+ ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zd, $Zn, $Zm",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-24} = 0b00000100;
+ let Inst{23-22} = sz8_64;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = 0b000;
+ let Inst{12-10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm> {
+ def _B : sve_int_bin_cons_arit_0<0b00, opc, asm, ZPR8>;
+ def _H : sve_int_bin_cons_arit_0<0b01, opc, asm, ZPR16>;
+ def _S : sve_int_bin_cons_arit_0<0b10, opc, asm, ZPR32>;
+ def _D : sve_int_bin_cons_arit_0<0b11, opc, asm, ZPR64>;
+}
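A minimal standalone sketch that packs the fields exactly as sve_int_bin_cons_arit_0 above lays them out, as a cross-check of the bit positions; encodeSveArith0 is an invented helper and the expected words are derived from this layout only, not taken from an assembler:

#include <cassert>
#include <cstdint>

static uint32_t encodeSveArith0(unsigned Sz, unsigned Opc, unsigned Zd,
                                unsigned Zn, unsigned Zm) {
  uint32_t Inst = 0;
  Inst |= 0b00000100u << 24;  // Inst{31-24}
  Inst |= (Sz & 0x3u) << 22;  // Inst{23-22} element size
  Inst |= 1u << 21;           // Inst{21}
  Inst |= (Zm & 0x1Fu) << 16; // Inst{20-16}
                              // Inst{15-13} = 0b000
  Inst |= (Opc & 0x7u) << 10; // Inst{12-10}
  Inst |= (Zn & 0x1Fu) << 5;  // Inst{9-5}
  Inst |= Zd & 0x1Fu;         // Inst{4-0}
  return Inst;
}

int main() {
  // add z0.b, z1.b, z2.b: sz=0b00, opc=0b000, Zd=0, Zn=1, Zm=2
  assert(encodeSveArith0(0b00, 0b000, 0, 1, 2) == 0x04220020u);
  // sub z3.d, z4.d, z5.d: sz=0b11, opc=0b001, Zd=3, Zn=4, Zm=5
  assert(encodeSveArith0(0b11, 0b001, 3, 4, 5) == 0x04E50483u);
}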
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 156f7bc6512..b17b6716766 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -400,7 +400,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
return false;
FastMathFlags FMF = FPOp->getFastMathFlags();
- bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
+ bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
FMF.allowReciprocal();
// With UnsafeDiv node will be optimized to just rcp and mul.
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 2329fffd521..91fe921bfee 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -15,7 +15,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c313e4a04ef..f04efd71fa0 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -204,6 +204,7 @@ private:
void SelectADD_SUB_I64(SDNode *N);
void SelectUADDO_USUBO(SDNode *N);
void SelectDIV_SCALE(SDNode *N);
+ void SelectMAD_64_32(SDNode *N);
void SelectFMA_W_CHAIN(SDNode *N);
void SelectFMUL_W_CHAIN(SDNode *N);
@@ -594,6 +595,11 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectDIV_SCALE(N);
return;
}
+ case AMDGPUISD::MAD_I64_I32:
+ case AMDGPUISD::MAD_U64_U32: {
+ SelectMAD_64_32(N);
+ return;
+ }
case ISD::CopyToReg: {
const SITargetLowering& Lowering =
*static_cast<const SITargetLowering*>(getTargetLowering());
@@ -814,6 +820,19 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
+void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
+ SDLoc SL(N);
+ bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
+ unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32;
+
+ SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ Clamp };
+ CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+}
+
bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
unsigned OffsetBits) const {
if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index fe2c9337721..d502b77447d 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -128,27 +128,20 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
-bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op)
-{
- assert(Op.getOpcode() == ISD::OR);
-
- SDValue N0 = Op->getOperand(0);
- SDValue N1 = Op->getOperand(1);
- EVT VT = N0.getValueType();
-
- if (VT.isInteger() && !VT.isVector()) {
- KnownBits LHSKnown, RHSKnown;
- DAG.computeKnownBits(N0, LHSKnown);
+unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
+ KnownBits Known;
+ EVT VT = Op.getValueType();
+ DAG.computeKnownBits(Op, Known);
- if (LHSKnown.Zero.getBoolValue()) {
- DAG.computeKnownBits(N1, RHSKnown);
+ return VT.getSizeInBits() - Known.countMinLeadingZeros();
+}
- if (!(~RHSKnown.Zero & ~LHSKnown.Zero))
- return true;
- }
- }
+unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
- return false;
+ // In order for this to be a signed 24-bit value, bit 23 must

+ // be a sign bit.
+ return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
}
AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
@@ -2615,21 +2608,14 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
//===----------------------------------------------------------------------===//
static bool isU24(SDValue Op, SelectionDAG &DAG) {
- KnownBits Known;
- EVT VT = Op.getValueType();
- DAG.computeKnownBits(Op, Known);
-
- return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24;
+ return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
}
static bool isI24(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
-
- // In order for this to be a signed 24-bit value, bit 23, must
- // be a sign bit.
return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
// as unsigned 24-bit values.
- (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
+ AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
}
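
isU24/isI24 now reuse the two helpers: a value is a u24 if at most 24 bits remain after stripping known leading zeros, and an i24 if it needs fewer than 24 bits after discarding redundant sign bits. A self-contained sketch of the same arithmetic on plain 32-bit constants (no DAG known-bits machinery, helper names assumed):

#include <cassert>
#include <cstdint>

static unsigned countLeadingZeros32(uint32_t X) {
  unsigned N = 0;
  for (int B = 31; B >= 0 && !((X >> B) & 1); --B) ++N;
  return N;
}

// Unsigned: bit width minus known leading zero bits.
unsigned numBitsUnsigned(uint32_t X) { return 32 - countLeadingZeros32(X); }

// Signed: bit width minus the count of leading copies of the sign bit.
unsigned numBitsSigned(int32_t X) {
  uint32_t M = X < 0 ? ~static_cast<uint32_t>(X) : static_cast<uint32_t>(X);
  return 32 - countLeadingZeros32(M);
}

bool isU24(uint32_t X) { return numBitsUnsigned(X) <= 24; }
bool isI24(int32_t X)  { return numBitsSigned(X) < 24; }

int main() {
  assert(isU24(0x00FFFFFFu) && !isU24(0x01000000u));
  assert(isI24(0x007FFFFF) && isI24(-0x00800000) && !isI24(0x00800000));
}
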
static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
@@ -2914,21 +2900,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
return DAG.getZExtOrTrunc(Shl, SL, VT);
}
- case ISD::OR:
- if (!isOrEquivalentToAdd(DAG, LHS))
- break;
- LLVM_FALLTHROUGH;
- case ISD::ADD: {
- // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)
- if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
- SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0),
- SDValue(RHS, 0));
- SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal,
- SDLoc(C2), VT);
- return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V);
- }
- break;
- }
}
if (VT != MVT::i64)
@@ -3946,6 +3917,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(MUL_LOHI_I24)
NODE_NAME_CASE(MAD_U24)
NODE_NAME_CASE(MAD_I24)
+ NODE_NAME_CASE(MAD_I64_I32)
+ NODE_NAME_CASE(MAD_U64_U32)
NODE_NAME_CASE(TEXTURE_FETCH)
NODE_NAME_CASE(EXPORT)
NODE_NAME_CASE(EXPORT_DONE)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index cdb15186f86..ba35aeb90ed 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -35,7 +35,8 @@ private:
SDValue getFFBX_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, unsigned Opc) const;
public:
- static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op);
+ static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG);
+ static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG);
protected:
const AMDGPUSubtarget *Subtarget;
@@ -379,6 +380,8 @@ enum NodeType : unsigned {
MULHI_I24,
MAD_U24,
MAD_I24,
+ MAD_U64_U32,
+ MAD_I64_I32,
MUL_LOHI_I24,
MUL_LOHI_U24,
TEXTURE_FETCH,
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 41cc7d7093e..f1a42b42f1f 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -18,7 +18,7 @@
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "AMDGPUGenInstrInfo.inc"
diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index e7e54750fe6..714c60a7446 100644
--- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -487,7 +487,7 @@ bool AMDGPULibCalls::parseFunctionName(const StringRef& FMangledName,
bool AMDGPULibCalls::isUnsafeMath(const CallInst *CI) const {
if (auto Op = dyn_cast<FPMathOperator>(CI))
- if (Op->hasUnsafeAlgebra())
+ if (Op->isFast())
return true;
const Function *F = CI->getParent()->getParent();
Attribute Attr = F->getFnAttribute("unsafe-fp-math");
@@ -1337,7 +1337,8 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
// for OpenCL 2.0 we have only generic implementation of sincos
// function.
AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
- nf.getLeads()[0].PtrKind = AMDGPULibFunc::GENERIC;
+ const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*M);
+ nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AS.FLAT_ADDRESS);
Function *Fsincos = dyn_cast_or_null<Function>(getFunction(M, nf));
if (!Fsincos) return false;
@@ -1350,7 +1351,6 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
// The allocaInst allocates the memory in private address space. This needs
// to be bitcasted to point to the address space of cos pointer type.
// In OpenCL 2.0 this is generic, while in 1.2 that is private.
- const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*M);
if (PTy->getPointerAddressSpace() != AS.PRIVATE_ADDRESS)
P = B.CreateAddrSpaceCast(Alloc, PTy);
CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P);
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index 919e8c1e13c..4671273d61f 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+#include "AMDGPU.h"
#include "AMDGPULibFunc.h"
#include <llvm/ADT/SmallString.h>
#include <llvm/ADT/SmallVector.h>
@@ -458,13 +459,16 @@ AMDGPULibFunc::Param ParamIterator::getNextParam() {
P.ArgType = AMDGPULibFunc::I32;
break;
- case E_CONSTPTR_SWAPGL:
- switch (P.PtrKind & AMDGPULibFunc::ADDR_SPACE) {
- case AMDGPULibFunc::GLOBAL: P.PtrKind = AMDGPULibFunc::LOCAL; break;
- case AMDGPULibFunc::LOCAL: P.PtrKind = AMDGPULibFunc::GLOBAL; break;
+ case E_CONSTPTR_SWAPGL: {
+ unsigned AS = AMDGPULibFunc::getAddrSpaceFromEPtrKind(P.PtrKind);
+ switch (AS) {
+ case AMDGPUAS::GLOBAL_ADDRESS: AS = AMDGPUAS::LOCAL_ADDRESS; break;
+ case AMDGPUAS::LOCAL_ADDRESS: AS = AMDGPUAS::GLOBAL_ADDRESS; break;
}
+ P.PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AS);
P.PtrKind |= AMDGPULibFunc::CONST;
break;
+ }
default: llvm_unreachable("Unhandled param rule");
}
@@ -590,19 +594,14 @@ bool ItaniumParamParser::parseItaniumParam(StringRef& param,
if (eatTerm(param, 'P')) {
if (eatTerm(param, 'K')) res.PtrKind |= AMDGPULibFunc::CONST;
if (eatTerm(param, 'V')) res.PtrKind |= AMDGPULibFunc::VOLATILE;
+ unsigned AS;
if (!eatTerm(param, "U3AS")) {
- res.PtrKind |= AMDGPULibFunc::PRIVATE;
+ AS = 0;
} else {
- switch(param.front()) {
- case '1': res.PtrKind |= AMDGPULibFunc::GLOBAL; break;
- case '2': res.PtrKind |= AMDGPULibFunc::READONLY;break;
- case '3': res.PtrKind |= AMDGPULibFunc::LOCAL; break;
- case '4': res.PtrKind |= AMDGPULibFunc::GENERIC; break;
- case '5': res.PtrKind |= AMDGPULibFunc::OTHER; break;
- default: return false;
- }
+ AS = param.front() - '0';
drop_front(param, 1);
}
+ res.PtrKind |= AMDGPULibFuncBase::getEPtrKindFromAddrSpace(AS);
} else {
res.PtrKind = AMDGPULibFunc::BYVALUE;
}
@@ -837,7 +836,9 @@ public:
os << 'P';
if (p.PtrKind & AMDGPULibFunc::CONST) os << 'K';
if (p.PtrKind & AMDGPULibFunc::VOLATILE) os << 'V';
- int AS = UseAddrSpace ? (p.PtrKind & AMDGPULibFunc::ADDR_SPACE)-1 : 0;
+ unsigned AS = UseAddrSpace
+ ? AMDGPULibFuncBase::getAddrSpaceFromEPtrKind(p.PtrKind)
+ : 0;
if (AS != 0) os << "U3AS" << AS;
Ptr = p;
p.PtrKind = 0;
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.h b/lib/Target/AMDGPU/AMDGPULibFunc.h
index 8f4297b4a49..5405bc64571 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.h
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -283,14 +283,7 @@ public:
enum EPtrKind {
BYVALUE = 0,
- PRIVATE,
- GLOBAL,
- READONLY,
- LOCAL,
- GENERIC,
- OTHER,
-
- ADDR_SPACE = 0xF,
+ ADDR_SPACE = 0xF, // Address space takes value 0x1 ~ 0xF.
CONST = 0x10,
VOLATILE = 0x20
};
@@ -315,6 +308,17 @@ public:
static bool isMangled(EFuncId Id) {
return static_cast<unsigned>(Id) <= static_cast<unsigned>(EI_LAST_MANGLED);
}
+
+ static unsigned getEPtrKindFromAddrSpace(unsigned AS) {
+ assert(((AS + 1) & ~ADDR_SPACE) == 0);
+ return AS + 1;
+ }
+
+ static unsigned getAddrSpaceFromEPtrKind(unsigned Kind) {
+ Kind = Kind & ADDR_SPACE;
+ assert(Kind >= 1);
+ return Kind - 1;
+ }
};
class AMDGPULibFuncImpl : public AMDGPULibFuncBase {
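
The two helpers above replace the old fixed PRIVATE/GLOBAL/... enumerators with a biased encoding: address space N is stored as N+1 in the low nibble, so address space 0 no longer collides with BYVALUE and the CONST/VOLATILE flag bits stay untouched. A minimal round-trip sketch under that assumption:

#include <cassert>

enum : unsigned { BYVALUE = 0, ADDR_SPACE = 0xF, CONST = 0x10, VOLATILE = 0x20 };

unsigned getEPtrKindFromAddrSpace(unsigned AS) {
  assert(((AS + 1) & ~ADDR_SPACE) == 0 && "address space must fit in the nibble");
  return AS + 1;
}

unsigned getAddrSpaceFromEPtrKind(unsigned Kind) {
  Kind &= ADDR_SPACE;
  assert(Kind >= 1 && "BYVALUE carries no address space");
  return Kind - 1;
}

int main() {
  for (unsigned AS = 0; AS <= 14; ++AS) {
    unsigned Kind = getEPtrKindFromAddrSpace(AS) | CONST; // flag bits survive
    assert(getAddrSpaceFromEPtrKind(Kind) == AS);
  }
}
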
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 9fc9592bdc5..83122281d2b 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -23,7 +23,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>
using namespace llvm;
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 56a5fa634b5..6ee529c8549 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -462,6 +462,10 @@ public:
return isAmdHsaOS() || isMesaKernel(MF);
}
+ bool hasMad64_32() const {
+ return getGeneration() >= SEA_ISLANDS;
+ }
+
bool hasFminFmaxLegacy() const {
return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f7ecdea7704..14f26f787ab 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -245,6 +245,10 @@ GCNMinRegSchedRegistry("gcn-minreg",
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
+ if (TT.getEnvironmentName() == "amdgiz" ||
+ TT.getEnvironmentName() == "amdgizcl")
+ return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}
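
The only difference in the amdgiz/amdgizcl layout string is the trailing "-A5", which declares address space 5 as the alloca address space. A hedged sketch of checking just that property; it assumes building against LLVM headers and libraries of roughly this era (the DataLayout constructor and getAllocaAddrSpace are existing APIs):

#include "llvm/IR/DataLayout.h"
#include <cassert>

int main() {
  llvm::DataLayout Old("e-p:32:32-i64:64-n32:64");    // default alloca AS = 0
  llvm::DataLayout Giz("e-p:32:32-i64:64-n32:64-A5"); // amdgiz: alloca AS = 5
  assert(Old.getAllocaAddrSpace() == 0);
  assert(Giz.getAllocaAddrSpace() == 5);
}
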
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index d729dcc439e..d1120f5e330 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2134,12 +2134,17 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
- // FIXME: Remove this hack for function pointer types.
- const GlobalValue *GV = GA->getGlobal();
- assert(Callee.getValueType() == MVT::i32);
- Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(),
- false, GA->getTargetFlags());
+ // FIXME: Remove this hack for function pointer types after removing
+ // support of old address space mapping. In the new address space
+ // mapping, the pointer in the default address space is 64-bit and therefore
+ // does not need this hack.
+ if (Callee.getValueType() == MVT::i32) {
+ const GlobalValue *GV = GA->getGlobal();
+ Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), false,
+ GA->getTargetFlags());
+ }
}
+ assert(Callee.getValueType() == MVT::i64);
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
@@ -5957,18 +5962,57 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
return 0;
}
+static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
+ EVT VT,
+ SDValue N0, SDValue N1, SDValue N2,
+ bool Signed) {
+ unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
+ SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
+ SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
+}
+
SDValue SITargetLowering::performAddCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
-
- if (VT != MVT::i32)
- return SDValue();
-
SDLoc SL(N);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
+ && Subtarget->hasMad64_32() &&
+ !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
+ VT.getScalarSizeInBits() <= 64) {
+ if (LHS.getOpcode() != ISD::MUL)
+ std::swap(LHS, RHS);
+
+ SDValue MulLHS = LHS.getOperand(0);
+ SDValue MulRHS = LHS.getOperand(1);
+ SDValue AddRHS = RHS;
+
+ // TODO: Maybe restrict if SGPR inputs.
+ if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
+ numBitsUnsigned(MulRHS, DAG) <= 32) {
+ MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
+ MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
+ AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
+ return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
+ }
+
+ if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
+ MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
+ MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
+ AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
+ return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
+ }
+
+ return SDValue();
+ }
+
+ if (VT != MVT::i32)
+ return SDValue();
+
// add x, zext (setcc) => addcarry x, 0, setcc
// add x, sext (setcc) => subcarry x, 0, setcc
unsigned Opc = LHS.getOpcode();
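
The new combine folds add(mul(a, b), c) into a MAD_64_32 node when known-bits analysis proves both multiplicands fit in 32 bits (or, for the signed form, in fewer than 32 significant bits). The arithmetic fact it relies on, as a tiny standalone sketch with plain integers rather than DAG nodes:

#include <cassert>
#include <cstdint>

uint64_t mad_u64_u32(uint32_t A, uint32_t B, uint64_t C) {
  return static_cast<uint64_t>(A) * B + C; // one widening 32x32->64 multiply-add
}

int main() {
  uint64_t A = 0x0000FFFFu, B = 0x00012345u, C = 0x123456789ULL;
  // Both multiplicands fit in 32 bits, so truncating them loses nothing.
  assert(A * B + C ==
         mad_u64_u32(static_cast<uint32_t>(A), static_cast<uint32_t>(B), C));
}
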
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ade909cc84e..5dde72910ee 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -14,15 +14,15 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
-#include "AMDGPUMachineFunction.h"
#include "AMDGPUArgumentUsageInfo.h"
+#include "AMDGPUMachineFunction.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include <array>
@@ -87,9 +87,6 @@ public:
/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
- // FIXME: This should be removed and getPreloadedValue moved here.
- friend class SIRegisterInfo;
-
unsigned TIDReg = AMDGPU::NoRegister;
// Registers that may be reserved for spilling purposes. These may be the same
@@ -143,7 +140,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
private:
unsigned LDSWaveSpillSize = 0;
- unsigned ScratchOffsetReg;
unsigned NumUserSGPRs = 0;
unsigned NumSystemSGPRs = 0;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index aa041aab51c..666b80107dc 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -399,8 +399,10 @@ def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_
} // End Constraints = "@earlyclobber $vdst"
let isCommutable = 1 in {
+let SchedRW = [WriteDouble, WriteSALU] in {
def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
+} // End SchedRW = [WriteDouble, WriteSALU]
} // End isCommutable = 1
} // End SubtargetPredicate = isCIVI
diff --git a/lib/Target/ARC/ARCBranchFinalize.cpp b/lib/Target/ARC/ARCBranchFinalize.cpp
index 0fb8a420d86..e5b0f8f3208 100644
--- a/lib/Target/ARC/ARCBranchFinalize.cpp
+++ b/lib/Target/ARC/ARCBranchFinalize.cpp
@@ -20,8 +20,8 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include <vector>
using namespace llvm;
diff --git a/lib/Target/ARC/ARCFrameLowering.h b/lib/Target/ARC/ARCFrameLowering.h
index ac5378adbd8..c042bec016c 100644
--- a/lib/Target/ARC/ARCFrameLowering.h
+++ b/lib/Target/ARC/ARCFrameLowering.h
@@ -17,7 +17,7 @@
#include "ARC.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
diff --git a/lib/Target/ARC/ARCInstrInfo.h b/lib/Target/ARC/ARCInstrInfo.h
index 5285dce9f12..f965dd4ff7f 100644
--- a/lib/Target/ARC/ARCInstrInfo.h
+++ b/lib/Target/ARC/ARCInstrInfo.h
@@ -15,7 +15,7 @@
#define LLVM_LIB_TARGET_ARC_ARCINSTRINFO_H
#include "ARCRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "ARCGenInstrInfo.inc"
diff --git a/lib/Target/ARC/ARCRegisterInfo.cpp b/lib/Target/ARC/ARCRegisterInfo.cpp
index 66f95911d3e..bed47a0eab5 100644
--- a/lib/Target/ARC/ARCRegisterInfo.cpp
+++ b/lib/Target/ARC/ARCRegisterInfo.cpp
@@ -25,7 +25,7 @@
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 3688db943d5..dac11626a6f 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -37,6 +37,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
@@ -53,7 +54,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 9f168acd567..99bcf788ddf 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -21,7 +21,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include <array>
#include <cstdint>
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 63b14ee98d7..eaa8d4c0f1a 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -31,6 +31,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
@@ -41,7 +42,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp
index e1323cd9427..9c10a1c79a4 100644
--- a/lib/Target/ARM/ARMCallLowering.cpp
+++ b/lib/Target/ARM/ARMCallLowering.cpp
@@ -417,6 +417,12 @@ struct FormalArgHandler : public IncomingValueHandler {
bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
const Function &F,
ArrayRef<unsigned> VRegs) const {
+ auto &TLI = *getTLI<ARMTargetLowering>();
+ auto Subtarget = TLI.getSubtarget();
+
+ if (Subtarget->isThumb())
+ return false;
+
// Quick exit if there aren't any args
if (F.arg_empty())
return true;
@@ -427,12 +433,6 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
auto &MF = MIRBuilder.getMF();
auto &MBB = MIRBuilder.getMBB();
auto DL = MF.getDataLayout();
- auto &TLI = *getTLI<ARMTargetLowering>();
-
- auto Subtarget = TLI.getSubtarget();
-
- if (Subtarget->isThumb())
- return false;
for (auto &Arg : F.args())
if (!isSupportedType(DL, TLI, Arg.getType()))
diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h
index 71b81936240..284b67fd59b 100644
--- a/lib/Target/ARM/ARMCallingConv.h
+++ b/lib/Target/ARM/ARMCallingConv.h
@@ -19,8 +19,8 @@
#include "ARMBaseInstrInfo.h"
#include "ARMSubtarget.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/CallingConv.h"
-#include "llvm/Target/TargetInstrInfo.h"
namespace llvm {
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 5c967d9f906..f42d00ecf60 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -43,6 +43,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
@@ -72,7 +73,6 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOpcodes.h"
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index ce4add974d6..ab8fbb47086 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -34,6 +34,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
@@ -49,7 +50,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index 2c10031e3f8..1f18e2bf80c 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -11,7 +11,7 @@
#define LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include <vector>
namespace llvm {
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 9cfa0d3a7c3..deece84ecf2 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -57,6 +57,7 @@
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
@@ -94,7 +95,6 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOpcodes.h"
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index 309430b0e9c..34186dede0d 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -24,6 +24,54 @@
using namespace llvm;
+/// FIXME: The following static functions are SizeChangeStrategy functions
+/// that are meant to temporarily mimic the behaviour of the old legalization
+/// based on doubling/halving non-legal types as closely as possible. This is
+/// not entirely possible as only legalizing the types that are exactly a power
+/// of 2 times the size of the legal types would require specifying all those
+/// sizes explicitly.
+/// In practice, not specifying those isn't a problem, and the below functions
+/// should disappear quickly as we add support for legalizing non-power-of-2
+/// sized types further.
+static void
+addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
+ const LegalizerInfo::SizeAndActionsVec &v) {
+ for (unsigned i = 0; i < v.size(); ++i) {
+ result.push_back(v[i]);
+ if (i + 1 < v[i].first && i + 1 < v.size() &&
+ v[i + 1].first != v[i].first + 1)
+ result.push_back({v[i].first + 1, LegalizerInfo::Unsupported});
+ }
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_8_16(const LegalizerInfo::SizeAndActionsVec &v) {
+ assert(v.size() >= 1);
+ assert(v[0].first > 17);
+ LegalizerInfo::SizeAndActionsVec result = {
+ {1, LegalizerInfo::Unsupported},
+ {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported},
+ {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}};
+ addAndInterleaveWithUnsupported(result, v);
+ auto Largest = result.back().first;
+ result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+ return result;
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1_8_16(const LegalizerInfo::SizeAndActionsVec &v) {
+ assert(v.size() >= 1);
+ assert(v[0].first > 17);
+ LegalizerInfo::SizeAndActionsVec result = {
+ {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported},
+ {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported},
+ {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}};
+ addAndInterleaveWithUnsupported(result, v);
+ auto Largest = result.back().first;
+ result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+ return result;
+}
+
static bool AEABI(const ARMSubtarget &ST) {
return ST.isTargetAEABI() || ST.isTargetGNUAEABI() || ST.isTargetMuslAEABI();
}
@@ -49,14 +97,15 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
}
for (unsigned Op : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) {
- for (auto Ty : {s1, s8, s16})
- setAction({Op, Ty}, WidenScalar);
+ if (Op != G_ADD)
+ setLegalizeScalarToDifferentSizeStrategy(
+ Op, 0, widenToLargerTypesUnsupportedOtherwise);
setAction({Op, s32}, Legal);
}
for (unsigned Op : {G_SDIV, G_UDIV}) {
- for (auto Ty : {s8, s16})
- setAction({Op, Ty}, WidenScalar);
+ setLegalizeScalarToDifferentSizeStrategy(Op, 0,
+ widenToLargerTypesUnsupportedOtherwise);
if (ST.hasDivideInARMMode())
setAction({Op, s32}, Legal);
else
@@ -64,8 +113,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
}
for (unsigned Op : {G_SREM, G_UREM}) {
- for (auto Ty : {s8, s16})
- setAction({Op, Ty}, WidenScalar);
+ setLegalizeScalarToDifferentSizeStrategy(Op, 0, widen_8_16);
if (ST.hasDivideInARMMode())
setAction({Op, s32}, Lower);
else if (AEABI(ST))
@@ -74,10 +122,8 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
setAction({Op, s32}, Libcall);
}
- for (unsigned Op : {G_SEXT, G_ZEXT}) {
+ for (unsigned Op : {G_SEXT, G_ZEXT, G_ANYEXT}) {
setAction({Op, s32}, Legal);
- for (auto Ty : {s1, s8, s16})
- setAction({Op, 1, Ty}, Legal);
}
for (unsigned Op : {G_ASHR, G_LSHR, G_SHL})
@@ -93,12 +139,11 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
setAction({G_BRCOND, s1}, Legal);
setAction({G_CONSTANT, s32}, Legal);
- for (auto Ty : {s1, s8, s16})
- setAction({G_CONSTANT, Ty}, WidenScalar);
+ setLegalizeScalarToDifferentSizeStrategy(G_CONSTANT, 0, widen_1_8_16);
setAction({G_ICMP, s1}, Legal);
- for (auto Ty : {s8, s16})
- setAction({G_ICMP, 1, Ty}, WidenScalar);
+ setLegalizeScalarToDifferentSizeStrategy(G_ICMP, 1,
+ widenToLargerTypesUnsupportedOtherwise);
for (auto Ty : {s32, p0})
setAction({G_ICMP, 1, Ty}, Legal);
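
The legalization rules above are now expressed as SizeAndActionsVec tables: sorted (bit-size, action) pairs where each entry covers all sizes up to the next entry, closed by a trailing Unsupported entry. A small sketch of how such a table is read, with made-up helper names and a shape roughly matching widen_8_16 applied on top of a legal s32:

#include <cassert>
#include <utility>
#include <vector>

enum Action { Legal, WidenScalar, Unsupported };
using SizeAndActionsVec = std::vector<std::pair<unsigned, Action>>;

Action actionFor(const SizeAndActionsVec &V, unsigned Size) {
  Action A = Unsupported;
  for (const auto &E : V)
    if (Size >= E.first) A = E.second; // last entry whose start <= Size wins
  return A;
}

int main() {
  SizeAndActionsVec V = {{1, Unsupported},  {8, WidenScalar}, {9, Unsupported},
                         {16, WidenScalar}, {17, Unsupported},
                         {32, Legal},       {33, Unsupported}};
  assert(actionFor(V, 8) == WidenScalar);  // s8 widens to the next legal size
  assert(actionFor(V, 16) == WidenScalar);
  assert(actionFor(V, 32) == Legal);
  assert(actionFor(V, 7) == Unsupported);  // odd sizes stay unsupported here
}
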
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 4aa7e150342..7424af9d5a5 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -41,10 +41,12 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
@@ -53,8 +55,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/Target/ARM/ARMMacroFusion.cpp b/lib/Target/ARM/ARMMacroFusion.cpp
index a34ed2cb5a2..5c9aad417ce 100644
--- a/lib/Target/ARM/ARMMacroFusion.cpp
+++ b/lib/Target/ARM/ARMMacroFusion.cpp
@@ -15,7 +15,7 @@
#include "ARMMacroFusion.h"
#include "ARMSubtarget.h"
#include "llvm/CodeGen/MacroFusion.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
namespace llvm {
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 4f330e3a884..f6996f098c0 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -31,6 +31,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
@@ -38,7 +39,6 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <bitset>
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index d911dd97b1a..a0b98a43108 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -25,6 +25,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -34,7 +35,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
diff --git a/lib/Target/ARM/ThumbRegisterInfo.cpp b/lib/Target/ARM/ThumbRegisterInfo.cpp
index 15a56752333..d2bebb9eeec 100644
--- a/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -29,7 +29,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
diff --git a/lib/Target/AVR/AVRFrameLowering.h b/lib/Target/AVR/AVRFrameLowering.h
index 30ef441183a..a0ba6c95127 100644
--- a/lib/Target/AVR/AVRFrameLowering.h
+++ b/lib/Target/AVR/AVRFrameLowering.h
@@ -10,7 +10,7 @@
#ifndef LLVM_AVR_FRAME_LOWERING_H
#define LLVM_AVR_FRAME_LOWERING_H
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
diff --git a/lib/Target/AVR/AVRInstrInfo.h b/lib/Target/AVR/AVRInstrInfo.h
index eee8a92c619..354edcec346 100644
--- a/lib/Target/AVR/AVRInstrInfo.h
+++ b/lib/Target/AVR/AVRInstrInfo.h
@@ -14,7 +14,7 @@
#ifndef LLVM_AVR_INSTR_INFO_H
#define LLVM_AVR_INSTR_INFO_H
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "AVRRegisterInfo.h"
diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp
index 7099b29a8bc..b6ac93452cb 100644
--- a/lib/Target/AVR/AVRRegisterInfo.cpp
+++ b/lib/Target/AVR/AVRRegisterInfo.cpp
@@ -18,7 +18,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/IR/Function.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "AVR.h"
#include "AVRInstrInfo.h"
diff --git a/lib/Target/BPF/BPFFrameLowering.h b/lib/Target/BPF/BPFFrameLowering.h
index 5db963f518b..b4ffa0713fa 100644
--- a/lib/Target/BPF/BPFFrameLowering.h
+++ b/lib/Target/BPF/BPFFrameLowering.h
@@ -14,7 +14,7 @@
#ifndef LLVM_LIB_TARGET_BPF_BPFFRAMELOWERING_H
#define LLVM_LIB_TARGET_BPF_BPFFRAMELOWERING_H
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
class BPFSubtarget;
diff --git a/lib/Target/BPF/BPFInstrInfo.h b/lib/Target/BPF/BPFInstrInfo.h
index c7048ab979b..f591f48a89a 100644
--- a/lib/Target/BPF/BPFInstrInfo.h
+++ b/lib/Target/BPF/BPFInstrInfo.h
@@ -15,7 +15,7 @@
#define LLVM_LIB_TARGET_BPF_BPFINSTRINFO_H
#include "BPFRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "BPFGenInstrInfo.inc"
diff --git a/lib/Target/BPF/BPFRegisterInfo.cpp b/lib/Target/BPF/BPFRegisterInfo.cpp
index 273843e9270..00d609e8960 100644
--- a/lib/Target/BPF/BPFRegisterInfo.cpp
+++ b/lib/Target/BPF/BPFRegisterInfo.cpp
@@ -18,10 +18,10 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#define GET_REGINFO_TARGET_DESC
#include "BPFGenRegisterInfo.inc"
diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
index 22794eb50e2..e28af5a844f 100644
--- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
+++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -14,9 +14,9 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cassert>
#include <vector>
diff --git a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
index 501ac2c44bb..6336075917e 100644
--- a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
+++ b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -19,8 +19,8 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/PassSupport.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h
index 296edbe1eff..988718860c5 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -15,7 +15,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include <vector>
namespace llvm {
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index a5381c1fb1a..9b8970258a2 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -36,6 +36,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -47,7 +48,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 2f172340c4e..1558c2e9850 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -19,8 +19,8 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include <cstdint>
#include <vector>
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 29e2bc32dfb..2154a485dc6 100644
--- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -161,9 +161,16 @@ namespace {
};
struct Simplifier {
- using Rule = std::function<Value * (Instruction *, LLVMContext &)>;
+ struct Rule {
+ using FuncType = std::function<Value* (Instruction*, LLVMContext&)>;
+ Rule(StringRef N, FuncType F) : Name(N), Fn(F) {}
+ StringRef Name; // For debugging.
+ FuncType Fn;
+ };
- void addRule(const Rule &R) { Rules.push_back(R); }
+ void addRule(StringRef N, const Rule::FuncType &F) {
+ Rules.push_back(Rule(N, F));
+ }
private:
struct WorkListType {
@@ -522,7 +529,7 @@ Value *Simplifier::simplify(Context &C) {
continue;
bool Changed = false;
for (Rule &R : Rules) {
- Value *W = R(U, C.Ctx);
+ Value *W = R.Fn(U, C.Ctx);
if (!W)
continue;
Changed = true;
@@ -1544,8 +1551,30 @@ Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At,
return R;
}
+static bool hasZeroSignBit(const Value *V) {
+ if (const auto *CI = dyn_cast<const ConstantInt>(V))
+ return (CI->getType()->getSignBit() & CI->getSExtValue()) == 0;
+ const Instruction *I = dyn_cast<const Instruction>(V);
+ if (!I)
+ return false;
+ switch (I->getOpcode()) {
+ case Instruction::LShr:
+ if (const auto SI = dyn_cast<const ConstantInt>(I->getOperand(1)))
+ return SI->getZExtValue() > 0;
+ return false;
+ case Instruction::Or:
+ case Instruction::Xor:
+ return hasZeroSignBit(I->getOperand(0)) &&
+ hasZeroSignBit(I->getOperand(1));
+ case Instruction::And:
+ return hasZeroSignBit(I->getOperand(0)) ||
+ hasZeroSignBit(I->getOperand(1));
+ }
+ return false;
+}
+
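
hasZeroSignBit propagates one fact: a value whose sign bit is known zero stays that way through lshr by a nonzero amount, through or/xor when both inputs qualify, and through and when either input does. A quick standalone check of those identities on concrete 32-bit values:

#include <cassert>
#include <cstdint>

bool signBitZero(uint32_t V) { return (V & 0x80000000u) == 0; }

int main() {
  uint32_t X = 0xDEADBEEFu, Y = 0x7FFFFFFFu;      // Y already has a zero sign bit
  assert(signBitZero(X >> 1));                    // lshr x, 1
  assert(signBitZero((X >> 1) | Y));              // or of two zero-sign-bit values
  assert(signBitZero((X >> 1) ^ Y));              // xor likewise
  assert(signBitZero(X & Y));                     // and: one clear side suffices
}
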
void PolynomialMultiplyRecognize::setupSimplifier() {
- Simp.addRule(
+ Simp.addRule("sink-zext",
// Sink zext past bitwise operations.
[](Instruction *I, LLVMContext &Ctx) -> Value* {
if (I->getOpcode() != Instruction::ZExt)
@@ -1566,7 +1595,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
B.CreateZExt(T->getOperand(0), I->getType()),
B.CreateZExt(T->getOperand(1), I->getType()));
});
- Simp.addRule(
+ Simp.addRule("xor/and -> and/xor",
// (xor (and x a) (and y a)) -> (and (xor x y) a)
[](Instruction *I, LLVMContext &Ctx) -> Value* {
if (I->getOpcode() != Instruction::Xor)
@@ -1584,7 +1613,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
And0->getOperand(1));
});
- Simp.addRule(
+ Simp.addRule("sink binop into select",
// (Op (select c x y) z) -> (select c (Op x z) (Op y z))
// (Op x (select c y z)) -> (select c (Op x y) (Op x z))
[](Instruction *I, LLVMContext &Ctx) -> Value* {
@@ -1610,7 +1639,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
}
return nullptr;
});
- Simp.addRule(
+ Simp.addRule("fold select-select",
// (select c (select c x y) z) -> (select c x z)
// (select c x (select c y z)) -> (select c x z)
[](Instruction *I, LLVMContext &Ctx) -> Value* {
@@ -1629,23 +1658,19 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
}
return nullptr;
});
- Simp.addRule(
+ Simp.addRule("or-signbit -> xor-signbit",
// (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
[](Instruction *I, LLVMContext &Ctx) -> Value* {
if (I->getOpcode() != Instruction::Or)
return nullptr;
- Instruction *LShr = dyn_cast<Instruction>(I->getOperand(0));
- if (!LShr || LShr->getOpcode() != Instruction::LShr)
- return nullptr;
- ConstantInt *One = dyn_cast<ConstantInt>(LShr->getOperand(1));
- if (!One || One->getZExtValue() != 1)
- return nullptr;
ConstantInt *Msb = dyn_cast<ConstantInt>(I->getOperand(1));
if (!Msb || Msb->getZExtValue() != Msb->getType()->getSignBit())
return nullptr;
- return IRBuilder<>(Ctx).CreateXor(LShr, Msb);
+ if (!hasZeroSignBit(I->getOperand(0)))
+ return nullptr;
+ return IRBuilder<>(Ctx).CreateXor(I->getOperand(0), Msb);
});
- Simp.addRule(
+ Simp.addRule("sink lshr into binop",
// (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
[](Instruction *I, LLVMContext &Ctx) -> Value* {
if (I->getOpcode() != Instruction::LShr)
@@ -1667,7 +1692,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
B.CreateLShr(BitOp->getOperand(0), S),
B.CreateLShr(BitOp->getOperand(1), S));
});
- Simp.addRule(
+ Simp.addRule("expose bitop-const",
// (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
[](Instruction *I, LLVMContext &Ctx) -> Value* {
auto IsBitOp = [](unsigned Op) -> bool {
@@ -1737,9 +1762,17 @@ bool PolynomialMultiplyRecognize::recognize() {
// XXX: Currently this approach can modify the loop before being 100% sure
// that the transformation can be carried out.
bool FoundPreScan = false;
+ auto FeedsPHI = [LoopB](const Value *V) -> bool {
+ for (const Value *U : V->users()) {
+ if (const auto *P = dyn_cast<const PHINode>(U))
+ if (P->getParent() == LoopB)
+ return true;
+ }
+ return false;
+ };
for (Instruction &In : *LoopB) {
SelectInst *SI = dyn_cast<SelectInst>(&In);
- if (!SI)
+ if (!SI || !FeedsPHI(SI))
continue;
Simplifier::Context C(SI);
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 93f1fd4109a..3c88eeeb8a4 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -24,12 +24,12 @@
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index 2525d272666..6cca5a849cc 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -20,8 +20,8 @@
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
#include <cassert>
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index d432bfef7ae..05865c43f2d 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -1706,28 +1706,27 @@ multiclass Loadxim_pat<PatFrag Load, ValueType VT, PatFrag ValueMod,
defm: Loadxgim_pat<Load, VT, ValueMod, ImmPred, MI>;
}
-// Patterns to select load reg reg-indexed: Rs + Rt<<u2.
-multiclass Loadxr_pat<PatFrag Load, ValueType VT, InstHexagon MI> {
- let AddedComplexity = 40 in
- def: Pat<(VT (Load (add I32:$Rs, (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))),
- (VT (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2))>;
-
- let AddedComplexity = 20 in
- def: Pat<(VT (Load (add I32:$Rs, I32:$Rt))),
- (VT (MI IntRegs:$Rs, IntRegs:$Rt, 0))>;
-}
-
-// Patterns to select load reg reg-indexed: Rs + Rt<<u2 with value modifier.
-multiclass Loadxrm_pat<PatFrag Load, ValueType VT, PatFrag ValueMod,
- InstHexagon MI> {
- let AddedComplexity = 40 in
- def: Pat<(VT (Load (add I32:$Rs, (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))),
- (VT (ValueMod (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2)))>;
+// Pattern to select load reg reg-indexed: Rs + Rt<<u2.
+class Loadxr_shl_pat<PatFrag Load, ValueType VT, InstHexagon MI>
+ : Pat<(VT (Load (add I32:$Rs, (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))),
+ (VT (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2))>;
+
+// Pattern to select load reg reg-indexed: Rs + Rt<<0.
+class Loadxr_add_pat<PatFrag Load, ValueType VT, InstHexagon MI>
+ : Pat<(VT (Load (add I32:$Rs, I32:$Rt))),
+ (VT (MI IntRegs:$Rs, IntRegs:$Rt, 0))>;
+
+// Pattern to select load reg reg-indexed: Rs + Rt<<u2 with value modifier.
+class Loadxrm_shl_pat<PatFrag Load, ValueType VT, PatFrag ValueMod,
+ InstHexagon MI>
+ : Pat<(VT (Load (add I32:$Rs, (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))),
+ (VT (ValueMod (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2)))>;
- let AddedComplexity = 20 in
- def: Pat<(VT (Load (add I32:$Rs, I32:$Rt))),
- (VT (ValueMod (MI IntRegs:$Rs, IntRegs:$Rt, 0)))>;
-}
+// Pattern to select load reg reg-indexed: Rs + Rt<<0 with value modifier.
+class Loadxrm_add_pat<PatFrag Load, ValueType VT, PatFrag ValueMod,
+ InstHexagon MI>
+ : Pat<(VT (Load (add I32:$Rs, I32:$Rt))),
+ (VT (ValueMod (MI IntRegs:$Rs, IntRegs:$Rt, 0)))>;
// Pattern to select load long-offset reg-indexed: Addr + Rt<<u2.
// Don't match for u2==0, instead use reg+imm for those cases.
@@ -1777,17 +1776,19 @@ let AddedComplexity = 20 in {
defm: Loadxi_pat<atomic_load_64, i64, anyimm3, L2_loadrd_io>;
}
-defm: Loadxim_pat<extloadi1, i64, ToZext64, anyimm0, L2_loadrub_io>;
-defm: Loadxim_pat<extloadi8, i64, ToZext64, anyimm0, L2_loadrub_io>;
-defm: Loadxim_pat<extloadi16, i64, ToZext64, anyimm1, L2_loadruh_io>;
-defm: Loadxim_pat<extloadi32, i64, ToZext64, anyimm2, L2_loadri_io>;
-defm: Loadxim_pat<zextloadi1, i64, ToZext64, anyimm0, L2_loadrub_io>;
-defm: Loadxim_pat<zextloadi8, i64, ToZext64, anyimm0, L2_loadrub_io>;
-defm: Loadxim_pat<zextloadi16, i64, ToZext64, anyimm1, L2_loadruh_io>;
-defm: Loadxim_pat<zextloadi32, i64, ToZext64, anyimm2, L2_loadri_io>;
-defm: Loadxim_pat<sextloadi8, i64, ToSext64, anyimm0, L2_loadrb_io>;
-defm: Loadxim_pat<sextloadi16, i64, ToSext64, anyimm1, L2_loadrh_io>;
-defm: Loadxim_pat<sextloadi32, i64, ToSext64, anyimm2, L2_loadri_io>;
+let AddedComplexity = 30 in {
+ defm: Loadxim_pat<extloadi1, i64, ToZext64, anyimm0, L2_loadrub_io>;
+ defm: Loadxim_pat<extloadi8, i64, ToZext64, anyimm0, L2_loadrub_io>;
+ defm: Loadxim_pat<extloadi16, i64, ToZext64, anyimm1, L2_loadruh_io>;
+ defm: Loadxim_pat<extloadi32, i64, ToZext64, anyimm2, L2_loadri_io>;
+ defm: Loadxim_pat<zextloadi1, i64, ToZext64, anyimm0, L2_loadrub_io>;
+ defm: Loadxim_pat<zextloadi8, i64, ToZext64, anyimm0, L2_loadrub_io>;
+ defm: Loadxim_pat<zextloadi16, i64, ToZext64, anyimm1, L2_loadruh_io>;
+ defm: Loadxim_pat<zextloadi32, i64, ToZext64, anyimm2, L2_loadri_io>;
+ defm: Loadxim_pat<sextloadi8, i64, ToSext64, anyimm0, L2_loadrb_io>;
+ defm: Loadxim_pat<sextloadi16, i64, ToSext64, anyimm1, L2_loadrh_io>;
+ defm: Loadxim_pat<sextloadi32, i64, ToSext64, anyimm2, L2_loadri_io>;
+}
let AddedComplexity = 60 in {
def: Loadxu_pat<extloadi8, i32, anyimm0, L4_loadrub_ur>;
@@ -1818,26 +1819,55 @@ let AddedComplexity = 60 in {
def: Loadxum_pat<extloadi32, i64, anyimm2, ToZext64, L4_loadri_ur>;
}
-defm: Loadxr_pat<extloadi8, i32, L4_loadrub_rr>;
-defm: Loadxr_pat<zextloadi8, i32, L4_loadrub_rr>;
-defm: Loadxr_pat<sextloadi8, i32, L4_loadrb_rr>;
-defm: Loadxr_pat<extloadi16, i32, L4_loadruh_rr>;
-defm: Loadxr_pat<zextloadi16, i32, L4_loadruh_rr>;
-defm: Loadxr_pat<sextloadi16, i32, L4_loadrh_rr>;
-defm: Loadxr_pat<load, i32, L4_loadri_rr>;
-defm: Loadxr_pat<load, i64, L4_loadrd_rr>;
-defm: Loadxr_pat<load, f32, L4_loadri_rr>;
-defm: Loadxr_pat<load, f64, L4_loadrd_rr>;
-
-defm: Loadxrm_pat<extloadi8, i64, ToZext64, L4_loadrub_rr>;
-defm: Loadxrm_pat<zextloadi8, i64, ToZext64, L4_loadrub_rr>;
-defm: Loadxrm_pat<sextloadi8, i64, ToSext64, L4_loadrb_rr>;
-defm: Loadxrm_pat<extloadi16, i64, ToZext64, L4_loadruh_rr>;
-defm: Loadxrm_pat<zextloadi16, i64, ToZext64, L4_loadruh_rr>;
-defm: Loadxrm_pat<sextloadi16, i64, ToSext64, L4_loadrh_rr>;
-defm: Loadxrm_pat<extloadi32, i64, ToZext64, L4_loadri_rr>;
-defm: Loadxrm_pat<zextloadi32, i64, ToZext64, L4_loadri_rr>;
-defm: Loadxrm_pat<sextloadi32, i64, ToSext64, L4_loadri_rr>;
+let AddedComplexity = 40 in {
+ def: Loadxr_shl_pat<extloadi8, i32, L4_loadrub_rr>;
+ def: Loadxr_shl_pat<zextloadi8, i32, L4_loadrub_rr>;
+ def: Loadxr_shl_pat<sextloadi8, i32, L4_loadrb_rr>;
+ def: Loadxr_shl_pat<extloadi16, i32, L4_loadruh_rr>;
+ def: Loadxr_shl_pat<zextloadi16, i32, L4_loadruh_rr>;
+ def: Loadxr_shl_pat<sextloadi16, i32, L4_loadrh_rr>;
+ def: Loadxr_shl_pat<load, i32, L4_loadri_rr>;
+ def: Loadxr_shl_pat<load, i64, L4_loadrd_rr>;
+ def: Loadxr_shl_pat<load, f32, L4_loadri_rr>;
+ def: Loadxr_shl_pat<load, f64, L4_loadrd_rr>;
+}
+
+let AddedComplexity = 20 in {
+ def: Loadxr_add_pat<extloadi8, i32, L4_loadrub_rr>;
+ def: Loadxr_add_pat<zextloadi8, i32, L4_loadrub_rr>;
+ def: Loadxr_add_pat<sextloadi8, i32, L4_loadrb_rr>;
+ def: Loadxr_add_pat<extloadi16, i32, L4_loadruh_rr>;
+ def: Loadxr_add_pat<zextloadi16, i32, L4_loadruh_rr>;
+ def: Loadxr_add_pat<sextloadi16, i32, L4_loadrh_rr>;
+ def: Loadxr_add_pat<load, i32, L4_loadri_rr>;
+ def: Loadxr_add_pat<load, i64, L4_loadrd_rr>;
+ def: Loadxr_add_pat<load, f32, L4_loadri_rr>;
+ def: Loadxr_add_pat<load, f64, L4_loadrd_rr>;
+}
+
+let AddedComplexity = 40 in {
+ def: Loadxrm_shl_pat<extloadi8, i64, ToZext64, L4_loadrub_rr>;
+ def: Loadxrm_shl_pat<zextloadi8, i64, ToZext64, L4_loadrub_rr>;
+ def: Loadxrm_shl_pat<sextloadi8, i64, ToSext64, L4_loadrb_rr>;
+ def: Loadxrm_shl_pat<extloadi16, i64, ToZext64, L4_loadruh_rr>;
+ def: Loadxrm_shl_pat<zextloadi16, i64, ToZext64, L4_loadruh_rr>;
+ def: Loadxrm_shl_pat<sextloadi16, i64, ToSext64, L4_loadrh_rr>;
+ def: Loadxrm_shl_pat<extloadi32, i64, ToZext64, L4_loadri_rr>;
+ def: Loadxrm_shl_pat<zextloadi32, i64, ToZext64, L4_loadri_rr>;
+ def: Loadxrm_shl_pat<sextloadi32, i64, ToSext64, L4_loadri_rr>;
+}
+
+let AddedComplexity = 20 in {
+ def: Loadxrm_add_pat<extloadi8, i64, ToZext64, L4_loadrub_rr>;
+ def: Loadxrm_add_pat<zextloadi8, i64, ToZext64, L4_loadrub_rr>;
+ def: Loadxrm_add_pat<sextloadi8, i64, ToSext64, L4_loadrb_rr>;
+ def: Loadxrm_add_pat<extloadi16, i64, ToZext64, L4_loadruh_rr>;
+ def: Loadxrm_add_pat<zextloadi16, i64, ToZext64, L4_loadruh_rr>;
+ def: Loadxrm_add_pat<sextloadi16, i64, ToSext64, L4_loadrh_rr>;
+ def: Loadxrm_add_pat<extloadi32, i64, ToZext64, L4_loadri_rr>;
+ def: Loadxrm_add_pat<zextloadi32, i64, ToZext64, L4_loadri_rr>;
+ def: Loadxrm_add_pat<sextloadi32, i64, ToSext64, L4_loadri_rr>;
+}
// Absolute address
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index 7d961a238ae..da53a09a6fc 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -44,12 +44,12 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/PassSupport.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index e491c757670..f29f321214c 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -26,13 +26,13 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MachineLocation.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index 68484344fde..0ff3afff5f5 100644
--- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -23,7 +23,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
diff --git a/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp b/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
index a0fdc70e141..52e5dcd4638 100644
--- a/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
+++ b/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
@@ -548,14 +548,13 @@ bool HexagonVectorLoopCarriedReuse::doVLCR() {
findValueToReuse();
if (ReuseCandidate.isDefined()) {
reuseValue();
- Changed = true;
- Continue = true;
- }
- std::for_each(Dependences.begin(), Dependences.end(),
- std::default_delete<DepChain>());
- } while (Continue);
- return Changed;
-}
+ Changed = true;
+ Continue = true;
+ }
+ llvm::for_each(Dependences, std::default_delete<DepChain>());
+ } while (Continue);
+ return Changed;
+}
void HexagonVectorLoopCarriedReuse::findDepChainFromPHI(Instruction *I,
DepChain &D) {
diff --git a/lib/Target/Hexagon/RDFGraph.cpp b/lib/Target/Hexagon/RDFGraph.cpp
index de58ddff339..22bb8841f5f 100644
--- a/lib/Target/Hexagon/RDFGraph.cpp
+++ b/lib/Target/Hexagon/RDFGraph.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -28,7 +29,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
index 802232b0582..6b4fa777178 100644
--- a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
+++ b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
@@ -17,8 +17,8 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/Target/Lanai/LanaiFrameLowering.h b/lib/Target/Lanai/LanaiFrameLowering.h
index 2f9b6c3c158..ca690d513fc 100644
--- a/lib/Target/Lanai/LanaiFrameLowering.h
+++ b/lib/Target/Lanai/LanaiFrameLowering.h
@@ -15,7 +15,7 @@
#define LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H
#include "Lanai.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
diff --git a/lib/Target/Lanai/LanaiInstrInfo.h b/lib/Target/Lanai/LanaiInstrInfo.h
index 4387fe1af3c..f07fede67a4 100644
--- a/lib/Target/Lanai/LanaiInstrInfo.h
+++ b/lib/Target/Lanai/LanaiInstrInfo.h
@@ -15,7 +15,7 @@
#define LLVM_LIB_TARGET_LANAI_LANAIINSTRINFO_H
#include "LanaiRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "LanaiGenInstrInfo.inc"
diff --git a/lib/Target/Lanai/LanaiMemAluCombiner.cpp b/lib/Target/Lanai/LanaiMemAluCombiner.cpp
index 7259c02194c..c29c933db74 100644
--- a/lib/Target/Lanai/LanaiMemAluCombiner.cpp
+++ b/lib/Target/Lanai/LanaiMemAluCombiner.cpp
@@ -30,8 +30,8 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
#define GET_INSTRMAP_INFO
diff --git a/lib/Target/Lanai/LanaiRegisterInfo.cpp b/lib/Target/Lanai/LanaiRegisterInfo.cpp
index 6ea477dce3e..56a5e0ea2de 100644
--- a/lib/Target/Lanai/LanaiRegisterInfo.cpp
+++ b/lib/Target/Lanai/LanaiRegisterInfo.cpp
@@ -20,11 +20,11 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#define GET_REGINFO_TARGET_DESC
#include "LanaiGenRegisterInfo.inc"
diff --git a/lib/Target/Lanai/LanaiSubtarget.h b/lib/Target/Lanai/LanaiSubtarget.h
index 2732ef3097e..313d950e8aa 100644
--- a/lib/Target/Lanai/LanaiSubtarget.h
+++ b/lib/Target/Lanai/LanaiSubtarget.h
@@ -19,7 +19,7 @@
#include "LanaiInstrInfo.h"
#include "LanaiSelectionDAGInfo.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/Target/Lanai/LanaiTargetMachine.h b/lib/Target/Lanai/LanaiTargetMachine.h
index ce1271d9dea..2fb1a053610 100644
--- a/lib/Target/Lanai/LanaiTargetMachine.h
+++ b/lib/Target/Lanai/LanaiTargetMachine.h
@@ -19,7 +19,7 @@
#include "LanaiInstrInfo.h"
#include "LanaiSelectionDAGInfo.h"
#include "LanaiSubtarget.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h
index fdc4aa52a19..8807101f37c 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.h
+++ b/lib/Target/MSP430/MSP430FrameLowering.h
@@ -15,7 +15,7 @@
#define LLVM_LIB_TARGET_MSP430_MSP430FRAMELOWERING_H
#include "MSP430.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
class MSP430FrameLowering : public TargetFrameLowering {
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index d81f17e753c..45357f54c9c 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -15,7 +15,7 @@
#define LLVM_LIB_TARGET_MSP430_MSP430INSTRINFO_H
#include "MSP430RegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "MSP430GenInstrInfo.inc"
diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h
index 97b5e810a1d..4935b80cfdd 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.h
+++ b/lib/Target/MSP430/MSP430TargetMachine.h
@@ -16,7 +16,7 @@
#define LLVM_LIB_TARGET_MSP430_MSP430TARGETMACHINE_H
#include "MSP430Subtarget.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 002fa512b21..d8e2eef6a9f 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -535,7 +535,7 @@ static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair,
uint64_t Address,
const void *Decoder);
@@ -2481,10 +2481,8 @@ static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair,
uint64_t Address, const void *Decoder) {
- unsigned RegPair = fieldFromInstruction(Insn, 7, 3);
-
switch (RegPair) {
default:
return MCDisassembler::Fail;
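
The DecodeMovePRegPair change above drops the explicit field extraction because the operand value now arrives already decoded. As a rough illustration of what the removed fieldFromInstruction(Insn, 7, 3) call computed (a minimal sketch with a hypothetical helper name, not LLVM's real templated helper):

  #include <cstdint>

  // Extract NumBits bits starting at StartBit from a raw instruction word.
  // The removed call pulled the 3-bit movep register-pair field out of
  // bits [9:7]; that value is now passed in directly as RegPair.
  static unsigned extractField(uint32_t Insn, unsigned StartBit, unsigned NumBits) {
    return (Insn >> StartBit) & ((1u << NumBits) - 1);
  }
  // extractField(Insn, 7, 3) == RegPair
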
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 12f7638594d..eae0f975080 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -1115,6 +1115,29 @@ MipsMCCodeEmitter::getMovePRegPairOpValue(const MCInst &MI, unsigned OpNo,
}
unsigned
+MipsMCCodeEmitter::getMovePRegSingleOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ assert(((OpNo == 2) || (OpNo == 3)) &&
+ "Unexpected OpNo for movep operand encoding!");
+
+ MCOperand Op = MI.getOperand(OpNo);
+ assert(Op.isReg() && "Operand of movep is not a register!");
+ switch (Op.getReg()) {
+ default:
+ llvm_unreachable("Unknown register for movep!");
+ case Mips::ZERO: return 0;
+ case Mips::S1: return 1;
+ case Mips::V0: return 2;
+ case Mips::V1: return 3;
+ case Mips::S0: return 4;
+ case Mips::S2: return 5;
+ case Mips::S3: return 6;
+ case Mips::S4: return 7;
+ }
+}
+
+unsigned
MipsMCCodeEmitter::getSimm23Lsl2Encoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index d12d3195521..1e840114b2b 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -252,6 +252,9 @@ public:
unsigned getMovePRegPairOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ unsigned getMovePRegSingleOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getSimm23Lsl2Encoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index aad6bf378ea..0bddba78145 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -246,8 +246,6 @@ void MipsMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
break;
case MEK_CALL_HI16:
case MEK_CALL_LO16:
- case MEK_DTPREL_HI:
- case MEK_DTPREL_LO:
case MEK_GOT:
case MEK_GOT_CALL:
case MEK_GOT_DISP:
@@ -263,14 +261,16 @@ void MipsMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
case MEK_NEG:
case MEK_PCREL_HI16:
case MEK_PCREL_LO16:
- case MEK_TLSLDM:
// If we do have nested target-specific expressions, they will be in
// a consecutive chain.
if (const MipsMCExpr *E = dyn_cast<const MipsMCExpr>(getSubExpr()))
E->fixELFSymbolsInTLSFixups(Asm);
break;
- case MEK_GOTTPREL:
+ case MEK_DTPREL_HI:
+ case MEK_DTPREL_LO:
+ case MEK_TLSLDM:
case MEK_TLSGD:
+ case MEK_GOTTPREL:
case MEK_TPREL_HI:
case MEK_TPREL_LO:
fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
diff --git a/lib/Target/Mips/MicroMips32r6InstrFormats.td b/lib/Target/Mips/MicroMips32r6InstrFormats.td
index 2f0933277e8..e1f1f9262b9 100644
--- a/lib/Target/Mips/MicroMips32r6InstrFormats.td
+++ b/lib/Target/Mips/MicroMips32r6InstrFormats.td
@@ -829,6 +829,21 @@ class POOL16C_NOT16_FM_MMR6 : MicroMipsR6Inst16 {
let Inst{3-0} = 0b0000;
}
+class POOL16C_MOVEP16_FM_MMR6 : MicroMipsR6Inst16 {
+ bits<3> dst_regs;
+ bits<3> rt;
+ bits<3> rs;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0b010001;
+ let Inst{9-7} = dst_regs;
+ let Inst{6-4} = rt;
+ let Inst{3} = rs{2};
+ let Inst{2} = 0b1;
+ let Inst{1-0} = rs{1-0};
+}
+
class POOL16C_OR16_XOR16_FM_MMR6<bits<4> op> : MicroMipsR6Inst16 {
bits<3> rt;
bits<3> rs;
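
For reference, the bit layout declared by POOL16C_MOVEP16_FM_MMR6 above can be spelled out in plain C++ (illustrative only; TableGen emits the real encoder, and the function name here is hypothetical):

  #include <cstdint>

  // Pack the 16-bit MMR6 MOVEP word from its three register fields,
  // following the Inst{...} assignments in POOL16C_MOVEP16_FM_MMR6.
  static uint16_t encodeMovepMMR6(unsigned DstRegs, unsigned Rt, unsigned Rs) {
    uint16_t Inst = 0;
    Inst |= 0x11u << 10;             // Inst{15-10} = 0b010001
    Inst |= (DstRegs & 0x7) << 7;    // Inst{9-7}   = dst_regs
    Inst |= (Rt & 0x7) << 4;         // Inst{6-4}   = rt
    Inst |= ((Rs >> 2) & 0x1) << 3;  // Inst{3}     = rs{2}
    Inst |= 0x1u << 2;               // Inst{2}     = 0b1
    Inst |= Rs & 0x3;                // Inst{1-0}   = rs{1-0}
    return Inst;
  }
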
diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td
index 425e75e14c8..49d6ae3f98a 100644
--- a/lib/Target/Mips/MicroMips32r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td
@@ -229,6 +229,7 @@ class SRL16_MMR6_ENC : SHIFT_FM_MM16<1>, MicroMipsR6Inst16;
class BREAK16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b011011>;
class LI16_MMR6_ENC : LI_FM_MM16;
class MOVE16_MMR6_ENC : MOVE_FM_MM16<0b000011>;
+class MOVEP_MMR6_ENC : POOL16C_MOVEP16_FM_MMR6;
class SDBBP16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b111011>;
class SUBU16_MMR6_ENC : POOL16A_SUBU16_FM_MMR6;
class XOR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1000>;
@@ -1204,6 +1205,7 @@ class LI16_MMR6_DESC : LoadImmMM16<"li16", li16_imm, GPRMM16Opnd>,
MMR6Arch<"li16">, MicroMipsR6Inst16, IsAsCheapAsAMove;
class MOVE16_MMR6_DESC : MoveMM16<"move16", GPR32Opnd>, MMR6Arch<"move16">,
MicroMipsR6Inst16;
+class MOVEP_MMR6_DESC : MovePMM16<"movep", GPRMM16OpndMoveP>, MMR6Arch<"movep">;
class SDBBP16_MMR6_DESC : BrkSdbbp16MM<"sdbbp16", II_SDBBP>, MMR6Arch<"sdbbp16">,
MicroMipsR6Inst16;
class SUBU16_MMR6_DESC : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
@@ -1679,6 +1681,8 @@ def LI16_MMR6 : StdMMR6Rel, LI16_MMR6_DESC, LI16_MMR6_ENC,
ISA_MICROMIPS32R6;
def MOVE16_MMR6 : StdMMR6Rel, MOVE16_MMR6_DESC, MOVE16_MMR6_ENC,
ISA_MICROMIPS32R6;
+def MOVEP_MMR6 : StdMMR6Rel, MOVEP_MMR6_DESC, MOVEP_MMR6_ENC,
+ ISA_MICROMIPS32R6;
def SDBBP16_MMR6 : StdMMR6Rel, SDBBP16_MMR6_DESC, SDBBP16_MMR6_ENC,
ISA_MICROMIPS32R6;
def SUBU16_MMR6 : StdMMR6Rel, SUBU16_MMR6_DESC, SUBU16_MMR6_ENC,
@@ -1879,3 +1883,10 @@ let AddedComplexity = 41 in {
}
def TAILCALL_MMR6 : TailCall<BC_MMR6, brtarget26_mm>, ISA_MICROMIPS32R6;
+
+def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
+ (TAILCALL_MMR6 tglobaladdr:$dst)>, ISA_MICROMIPS32R6;
+
+def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
+ (TAILCALL_MMR6 texternalsym:$dst)>, ISA_MICROMIPS32R6;
+
diff --git a/lib/Target/Mips/MicroMips64r6InstrInfo.td b/lib/Target/Mips/MicroMips64r6InstrInfo.td
index e0f4d833392..4f705feed0a 100644
--- a/lib/Target/Mips/MicroMips64r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips64r6InstrInfo.td
@@ -162,12 +162,11 @@ class DCLZ_MM64R6_DESC {
class DINSU_MM64R6_DESC : InsBase<"dinsu", GPR64Opnd, uimm5_plus32,
uimm5_inssize_plus1, immZExt5Plus32,
- immZExt5Plus1, MipsIns>;
+ immZExt5Plus1>;
class DINSM_MM64R6_DESC : InsBase<"dinsm", GPR64Opnd, uimm5, uimm_range_2_64,
- immZExt5, immZExtRange2To64, MipsIns>;
+ immZExt5, immZExtRange2To64>;
class DINS_MM64R6_DESC : InsBase<"dins", GPR64Opnd, uimm5_report_uimm6,
- uimm5_inssize_plus1, immZExt5, immZExt5Plus1,
- MipsIns>;
+ uimm5_inssize_plus1, immZExt5, immZExt5Plus1>;
class DMTC0_MM64R6_DESC : MTC0_MMR6_DESC_BASE<"dmtc0", COP0Opnd, GPR64Opnd,
II_DMTC0>;
class DMTC1_MM64R6_DESC : MTC1_MMR6_DESC_BASE<"dmtc1", FGR64Opnd, GPR64Opnd,
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 1f869db4efe..48c1d94d03c 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -631,7 +631,8 @@ def ADDIUSP_MM : AddImmUSP<"addiusp">, ADDIUSP_FM_MM16;
def MFHI16_MM : MoveFromHILOMM<"mfhi", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x10>;
def MFLO16_MM : MoveFromHILOMM<"mflo", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x12>;
def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>;
-def MOVEP_MM : MovePMM16<"movep", GPRMM16OpndMoveP>, MOVEP_FM_MM16;
+def MOVEP_MM : MovePMM16<"movep", GPRMM16OpndMoveP>, MOVEP_FM_MM16,
+ ISA_MICROMIPS_NOT_32R6_64R6;
def LI16_MM : LoadImmMM16<"li16", li16_imm, GPRMM16Opnd>, LI_FM_MM16,
IsAsCheapAsAMove;
def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>,
@@ -884,7 +885,7 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1, immZExt5,
immZExt5Plus1, MipsExt>, EXT_FM_MM<0x2c>;
def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, uimm5_inssize_plus1,
- immZExt5, immZExt5Plus1, MipsIns>,
+ immZExt5, immZExt5Plus1>,
EXT_FM_MM<0x0c>;
/// Jump Instructions
@@ -1061,13 +1062,13 @@ let Predicates = [InMicroMips] in {
(LW_MM addr:$addr)>;
def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
(SUBu_MM GPR32:$lhs, GPR32:$rhs)>;
-
- def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
- (TAILCALL_MM tglobaladdr:$dst)>, ISA_MIPS1_NOT_32R6_64R6;
- def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
- (TAILCALL_MM texternalsym:$dst)>, ISA_MIPS1_NOT_32R6_64R6;
}
+def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
+ (TAILCALL_MM tglobaladdr:$dst)>, ISA_MICROMIPS32_NOT_MIPS32R6;
+def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
+ (TAILCALL_MM texternalsym:$dst)>, ISA_MICROMIPS32_NOT_MIPS32R6;
+
let AddedComplexity = 40 in {
def : MipsPat<(i32 (sextloadi16 addrRegImm:$a)),
(LH_MM addrRegImm:$a)>;
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index 76bca3df2bc..cb59e2ddb1c 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -30,7 +30,7 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MachineLocation.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include <cassert>
#include <cstdint>
#include <vector>
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index bdb9eec4cc5..8ce47e3f669 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -17,8 +17,8 @@
#include "MipsRegisterInfo.h"
#include "MipsTargetMachine.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp
index 44771cbe8be..ff95f3c7228 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -22,6 +22,8 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Function.h"
@@ -29,8 +31,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 04a050c2ff4..dbd47de4dad 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -341,13 +341,13 @@ let AdditionalPredicates = [NotInMicroMips] in {
// for dinsm and dinsu like binutils.
let DecoderMethod = "DecodeDINS" in {
def DINS : InsBase<"dins", GPR64Opnd, uimm6, uimm5_inssize_plus1,
- immZExt5, immZExt5Plus1, MipsIns>, EXT_FM<7>,
+ immZExt5, immZExt5Plus1>, EXT_FM<7>,
ISA_MIPS64R2;
def DINSU : InsBase<"dinsu", GPR64Opnd, uimm5_plus32, uimm5_inssize_plus1,
- immZExt5Plus32, immZExt5Plus1, MipsIns>,
+ immZExt5Plus32, immZExt5Plus1>,
EXT_FM<6>, ISA_MIPS64R2;
def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5, uimm_range_2_64,
- immZExt5, immZExtRange2To64, MipsIns>,
+ immZExt5, immZExtRange2To64>,
EXT_FM<5>, ISA_MIPS64R2;
}
}
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index bec0ae6ba4c..5edd12c0232 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -37,6 +37,7 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
@@ -64,7 +65,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include <algorithm>
#include <cassert>
diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index ef05166503b..27a85970da6 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -107,38 +107,31 @@ bool MipsFrameLowering::hasBP(const MachineFunction &MF) const {
return MFI.hasVarSizedObjects() && TRI->needsStackRealignment(MF);
}
+// Estimate the size of the stack, including the incoming arguments. We need to
+// account for register spills, local objects, the reserved call frame and
+// incoming arguments. This gives the largest possible positive offset from $sp,
+// which is used to decide whether an emergency spill slot for stack addresses
+// is required.
uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
- int64_t Offset = 0;
+ int64_t Size = 0;
- // Iterate over fixed sized objects.
+ // Iterate over fixed sized objects which are incoming arguments.
for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
- Offset = std::max(Offset, -MFI.getObjectOffset(I));
+ if (MFI.getObjectOffset(I) > 0)
+ Size += MFI.getObjectSize(I);
// Conservatively assume all callee-saved registers will be saved.
for (const MCPhysReg *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) {
- unsigned Size = TRI.getSpillSize(*TRI.getMinimalPhysRegClass(*R));
- Offset = alignTo(Offset + Size, Size);
+ unsigned RegSize = TRI.getSpillSize(*TRI.getMinimalPhysRegClass(*R));
+ Size = alignTo(Size + RegSize, RegSize);
}
- unsigned MaxAlign = MFI.getMaxAlignment();
-
- // Check that MaxAlign is not zero if there is a stack object that is not a
- // callee-saved spill.
- assert(!MFI.getObjectIndexEnd() || MaxAlign);
-
- // Iterate over other objects.
- for (unsigned I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I)
- Offset = alignTo(Offset + MFI.getObjectSize(I), MaxAlign);
-
- // Call frame.
- if (MFI.adjustsStack() && hasReservedCallFrame(MF))
- Offset = alignTo(Offset + MFI.getMaxCallFrameSize(),
- std::max(MaxAlign, getStackAlignment()));
-
- return alignTo(Offset, getStackAlignment());
+ // Get the size of the rest of the frame objects and any possible reserved
+ // call frame, accounting for alignment.
+ return Size + MFI.estimateStackSize(MF);
}
// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions
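
A rough worked example of what the rewritten estimateStackSize now sums, with made-up numbers purely for illustration:

  // Hypothetical frame: 16 bytes of incoming arguments (fixed objects at
  // positive offsets), three 4-byte callee-saved spills (every CSR is
  // conservatively assumed spilled), and MFI.estimateStackSize(MF)
  // reporting 40 bytes for locals plus the reserved call frame.
  uint64_t IncomingArgs       = 16;
  uint64_t CalleeSavedSpills  = 12;
  uint64_t LocalsAndCallFrame = 40;  // MFI.estimateStackSize(MF)
  uint64_t Estimate = IncomingArgs + CalleeSavedSpills + LocalsAndCallFrame; // 68
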
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index 8c4214c4c21..883c3267d51 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -15,7 +15,7 @@
#define LLVM_LIB_TARGET_MIPS_MIPSFRAMELOWERING_H
#include "Mips.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
class MipsSubtarget;
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 38b3c3fb160..d31385f42d6 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -27,8 +27,8 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
@@ -45,6 +45,8 @@
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
@@ -62,8 +64,6 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index a5ed1be3bee..c18e395f901 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -25,7 +25,7 @@
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include <cstdint>
#define GET_INSTRINFO_HEADER
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index c4c3eb760c5..3502dbcdae9 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -212,6 +212,8 @@ def HasMicroMips64r6 : Predicate<"Subtarget->inMicroMips64r6Mode()">,
AssemblerPredicate<"FeatureMicroMips,FeatureMips64r6">;
def InMips16Mode : Predicate<"Subtarget->inMips16Mode()">,
AssemblerPredicate<"FeatureMips16">;
+def NotInMips16Mode : Predicate<"!Subtarget->inMips16Mode()">,
+ AssemblerPredicate<"!FeatureMips16">;
def HasCnMips : Predicate<"Subtarget->hasCnMips()">,
AssemblerPredicate<"FeatureCnMips">;
def NotCnMips : Predicate<"!Subtarget->hasCnMips()">,
@@ -1544,7 +1546,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
PseudoInstExpansion<(JumpInst Opnd:$target)>;
class TailCallReg<RegisterOperand RO> :
- MipsPseudo<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>;
+ PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>;
}
class BAL_BR_Pseudo<Instruction RealInst> :
@@ -1726,12 +1728,13 @@ class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
[(set RO:$rt, (Op RO:$rs, PosImm:$pos, SizeImm:$size))], II_EXT,
FrmR, opstr>, ISA_MIPS32R2;
+// 'ins' and its 64-bit variants are matched by C++ code.
class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
- Operand SizeOpnd, PatFrag PosImm, PatFrag SizeImm,
- SDPatternOperator Op = null_frag>:
+ Operand SizeOpnd, PatFrag PosImm, PatFrag SizeImm>:
InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size, RO:$src),
!strconcat(opstr, " $rt, $rs, $pos, $size"),
- [(set RO:$rt, (Op RO:$rs, PosImm:$pos, SizeImm:$size, RO:$src))],
+ [(set RO:$rt, (null_frag RO:$rs, PosImm:$pos, SizeImm:$size,
+ RO:$src))],
II_INS, FrmR, opstr>, ISA_MIPS32R2 {
let Constraints = "$src = $rt";
}
@@ -2086,7 +2089,7 @@ def BLTZALL : MMRel, BGEZAL_FT<"bltzall", brtarget, GPR32Opnd>,
BGEZAL_FM<0x12>, ISA_MIPS2_NOT_32R6_64R6;
def BAL_BR : BAL_BR_Pseudo<BGEZAL>;
-let Predicates = [NotInMicroMips] in {
+let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips] in {
def TAILCALL : TailCall<J, jmptarget>;
}
@@ -2103,6 +2106,7 @@ class PseudoIndirectBranchBase<RegisterOperand RO> :
let isBranch = 1;
let isIndirectBranch = 1;
bit isCTI = 1;
+ let Predicates = [NotInMips16Mode];
}
def PseudoIndirectBranch : PseudoIndirectBranchBase<GPR32Opnd>;
@@ -2236,7 +2240,7 @@ let AdditionalPredicates = [NotInMicroMips] in {
EXT_FM<0>;
def INS : MMRel, StdMMR6Rel, InsBase<"ins", GPR32Opnd, uimm5,
uimm5_inssize_plus1, immZExt5,
- immZExt5Plus1, MipsIns>,
+ immZExt5Plus1>,
EXT_FM<4>;
}
/// Move Control Registers From/To CPU Registers
@@ -2776,10 +2780,12 @@ def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)),
// (JALR GPR32:$dst)>;
// Tail call
-def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
- (TAILCALL tglobaladdr:$dst)>;
-def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
- (TAILCALL texternalsym:$dst)>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
+ (TAILCALL tglobaladdr:$dst)>;
+ def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
+ (TAILCALL texternalsym:$dst)>;
+}
// hi/lo relocs
multiclass MipsHiLoRelocs<Instruction Lui, Instruction Addiu,
Register ZeroReg, RegisterOperand GPROpnd> {
diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp
index 01c0cbf8262..3910adb7316 100644
--- a/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -28,11 +28,11 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/RecyclingAllocator.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 9c64a0ecbb1..ec966afee0e 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -28,7 +28,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cstdint>
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 08fb3d7d435..f64d91aad85 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -616,6 +616,7 @@ def GPRMM16OpndZero : RegisterOperand<GPRMM16Zero> {
def GPRMM16OpndMoveP : RegisterOperand<GPRMM16MoveP> {
let ParserMatchClass = GPRMM16AsmOperandMoveP;
+ let EncoderMethod = "getMovePRegSingleOpValue";
}
def GPR64Opnd : RegisterOperand<GPR64> {
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index 0b19b18449e..2d9cbabbc59 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -29,6 +29,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCDwarf.h"
@@ -37,7 +38,6 @@
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cassert>
@@ -893,10 +893,12 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
}
// Set scavenging frame index if necessary.
- uint64_t MaxSPOffset = MF.getInfo<MipsFunctionInfo>()->getIncomingArgSize() +
- estimateStackSize(MF);
+ uint64_t MaxSPOffset = estimateStackSize(MF);
- if (isInt<16>(MaxSPOffset))
+ // MSA load/store offsets are limited to 10 bits signed. If there is a
+ // variable-sized object on the stack, the estimation cannot account for it.
+ if (isIntN(STI.hasMSA() ? 10 : 16, MaxSPOffset) &&
+ !MF.getFrameInfo().hasVarSizedObjects())
return;
const TargetRegisterClass &RC =
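
The isIntN check above boils down to a signed-range test on the estimated offset; a minimal sketch, assuming a plain two's-complement range (the 10-bit case applies when MSA is present, 16 bits otherwise; the helper name is hypothetical):

  #include <cstdint>

  // True if Offset is addressable with a NumBits-bit signed immediate,
  // i.e. no emergency spill slot is needed for it.
  static bool fitsSignedImm(unsigned NumBits, int64_t Offset) {
    int64_t Lo = -(int64_t(1) << (NumBits - 1));
    int64_t Hi = (int64_t(1) << (NumBits - 1)) - 1;
    return Offset >= Lo && Offset <= Hi;
  }
  // fitsSignedImm(10, 600) -> false: with MSA, a scavenging index is reserved.
  // fitsSignedImm(16, 600) -> true:  without MSA, no spill slot is required.
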
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 283fcaa73a7..3c6a7d7a665 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -905,6 +905,64 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
break;
}
+ // Manually match MipsISD::Ins nodes to get the correct instruction. This has
+ // to be done here because dins and dinsm differ in the range of the size
+ // operand (0 < size <= 32 for dins, 2 <= size <= 64 for dinsm), so
+ // SelectionDAGISel would have to test all the operands at once to match
+ // the instruction.
+ case MipsISD::Ins: {
+
+ // Sanity checking for the node operands.
+ if (Node->getValueType(0) != MVT::i32 && Node->getValueType(0) != MVT::i64)
+ return false;
+
+ if (Node->getNumOperands() != 4)
+ return false;
+
+ if (Node->getOperand(1)->getOpcode() != ISD::Constant ||
+ Node->getOperand(2)->getOpcode() != ISD::Constant)
+ return false;
+
+ MVT ResTy = Node->getSimpleValueType(0);
+ uint64_t Pos = Node->getConstantOperandVal(1);
+ uint64_t Size = Node->getConstantOperandVal(2);
+
+ // Size has to be >0 for 'ins', 'dins' and 'dinsu'.
+ if (!Size)
+ return false;
+
+ if (Pos + Size > 64)
+ return false;
+
+ if (ResTy != MVT::i32 && ResTy != MVT::i64)
+ return false;
+
+ unsigned Opcode = 0;
+ if (ResTy == MVT::i32) {
+ if (Pos + Size <= 32)
+ Opcode = Mips::INS;
+ } else {
+ if (Pos + Size <= 32)
+ Opcode = Mips::DINS;
+ else if (Pos < 32 && 1 < Size)
+ Opcode = Mips::DINSM;
+ else
+ Opcode = Mips::DINSU;
+ }
+
+ if (Opcode) {
+ SDValue Ops[4] = {
+ Node->getOperand(0), CurDAG->getTargetConstant(Pos, DL, MVT::i32),
+ CurDAG->getTargetConstant(Size, DL, MVT::i32), Node->getOperand(3)};
+
+ ReplaceNode(Node, CurDAG->getMachineNode(Opcode, DL, ResTy, Ops));
+ return true;
+ }
+
+ return false;
+ }
+
case MipsISD::ThreadPointer: {
EVT PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout());
unsigned RdhwrOpc, DestReg;
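
The opcode choice in the MipsISD::Ins case above can be summarized as a small decision function (a condensed, illustrative sketch with a hypothetical name; the real code builds the MachineSDNode directly):

  // Which ins-family instruction covers a given (Pos, Size) insert.
  static const char *pickInsOpcode(bool Is64Bit, uint64_t Pos, uint64_t Size) {
    if (Size == 0 || Pos + Size > 64)
      return nullptr;                          // rejected, fall back to patterns
    if (!Is64Bit)
      return Pos + Size <= 32 ? "INS" : nullptr;
    if (Pos + Size <= 32)
      return "DINS";                           // 0 < pos+size <= 32
    if (Pos < 32 && Size > 1)
      return "DINSM";                          // 2 <= size <= 64
    return "DINSU";                            // pos >= 32
  }
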
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index 45d7f94f1d1..4dd9f7f219a 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -31,6 +31,7 @@
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Intrinsics.h"
@@ -40,7 +41,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
#include <cassert>
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index 86bd24166bb..2ff6b99e78f 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -23,6 +23,8 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Function.h"
@@ -30,8 +32,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/lib/Target/Mips/MipsScheduleGeneric.td b/lib/Target/Mips/MipsScheduleGeneric.td
index 89cda676441..9621009ed1c 100644
--- a/lib/Target/Mips/MipsScheduleGeneric.td
+++ b/lib/Target/Mips/MipsScheduleGeneric.td
@@ -736,6 +736,7 @@ def : InstRW<[GenericDSPShort], (instregex "^MFHI_DSP_MM$")>;
def : InstRW<[GenericDSPShort], (instregex "^MFLO_DSP_MM$")>;
def : InstRW<[GenericDSPShort], (instregex "^MODSUB_MM$")>;
def : InstRW<[GenericDSPShort], (instregex "^MOVEP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MOVEP_MMR6$")>;
def : InstRW<[GenericDSPShort], (instregex "^MOVN_I_MM$")>;
def : InstRW<[GenericDSPShort], (instregex "^MOVZ_I_MM$")>;
def : InstRW<[GenericDSPShort], (instregex "^MSUBU_DSP_MM$")>;
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 6ced2f6967c..729f3ed7b79 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -20,8 +20,8 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MachineLocation.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h
index 320ca9a2f09..a802cf85d2e 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.h
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -14,7 +14,7 @@
#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H
#define LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
class NVPTXSubtarget;
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7b9acb20b75..ac4f2544fc3 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3449,6 +3449,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
}
case Intrinsic::nvvm_atomic_load_add_f32:
+ case Intrinsic::nvvm_atomic_load_add_f64:
case Intrinsic::nvvm_atomic_load_inc_32:
case Intrinsic::nvvm_atomic_load_dec_32:
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
index d284282e28c..18ba7684ae5 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -16,7 +16,7 @@
#include "NVPTX.h"
#include "NVPTXRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "NVPTXGenInstrInfo.inc"
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index f745b6f6635..478f3e9d057 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1095,6 +1095,12 @@ def atomic_load_add_f32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
def atomic_load_add_f32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+def atomic_load_add_f64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
+def atomic_load_add_f64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
+def atomic_load_add_f64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add",
atomic_load_add_32_g, i32imm, imm, hasAtomRedG32>;
@@ -1121,6 +1127,13 @@ defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<Float32Regs, ".shared", ".f32", ".add",
defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<Float32Regs, "", ".f32", ".add",
atomic_load_add_f32_gen, f32imm, fpimm, hasAtomAddF32>;
+defm INT_PTX_ATOM_ADD_G_F64 : F_ATOMIC_2<Float64Regs, ".global", ".f64", ".add",
+ atomic_load_add_f64_g, f64imm, fpimm, hasAtomAddF64>;
+defm INT_PTX_ATOM_ADD_S_F64 : F_ATOMIC_2<Float64Regs, ".shared", ".f64", ".add",
+ atomic_load_add_f64_s, f64imm, fpimm, hasAtomAddF64>;
+defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2<Float64Regs, "", ".f64", ".add",
+ atomic_load_add_f64_gen, f64imm, fpimm, hasAtomAddF64>;
+
// atom_sub
def atomic_load_sub_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
diff --git a/lib/Target/NVPTX/NVPTXPeephole.cpp b/lib/Target/NVPTX/NVPTXPeephole.cpp
index 4e902c0fb50..38437b20433 100644
--- a/lib/Target/NVPTX/NVPTXPeephole.cpp
+++ b/lib/Target/NVPTX/NVPTXPeephole.cpp
@@ -36,7 +36,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 88288abe64f..3957d426653 100644
--- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -20,7 +20,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 8d46694fbe5..75573832988 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -18,8 +18,8 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MachineLocation.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index 7674135f0a7..54a72a688ee 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -17,7 +17,7 @@
#include "ManagedStringPool.h"
#include "NVPTXSubtarget.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
diff --git a/lib/Target/Nios2/Nios2FrameLowering.h b/lib/Target/Nios2/Nios2FrameLowering.h
index 2aaea678d9e..2d9e84b2c72 100644
--- a/lib/Target/Nios2/Nios2FrameLowering.h
+++ b/lib/Target/Nios2/Nios2FrameLowering.h
@@ -14,7 +14,7 @@
#define LLVM_LIB_TARGET_NIOS2_NIOS2FRAMELOWERING_H
#include "Nios2.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
class Nios2Subtarget;
diff --git a/lib/Target/Nios2/Nios2InstrInfo.h b/lib/Target/Nios2/Nios2InstrInfo.h
index 47e5e83a39d..6a0a050c839 100644
--- a/lib/Target/Nios2/Nios2InstrInfo.h
+++ b/lib/Target/Nios2/Nios2InstrInfo.h
@@ -18,7 +18,7 @@
#include "Nios2RegisterInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "Nios2GenInstrInfo.inc"
diff --git a/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/lib/Target/PowerPC/PPCBranchCoalescing.cpp
index 33085a42361..ac28d7ff497 100644
--- a/lib/Target/PowerPC/PPCBranchCoalescing.cpp
+++ b/lib/Target/PowerPC/PPCBranchCoalescing.cpp
@@ -21,9 +21,9 @@
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index fa813db5fef..f845d5a9ac6 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -15,7 +15,7 @@
#include "PPC.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 8ea3689b08e..2092748ca1a 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -36,6 +36,7 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DebugLoc.h"
@@ -53,7 +54,6 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
#include <cassert>
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index f3e7b4af45d..62ade966145 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -51,6 +51,7 @@
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
@@ -82,7 +83,6 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -291,14 +291,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FROUND, MVT::f32, Legal);
}
- // PowerPC does not have BSWAP
+ // PowerPC does not have BSWAP, but we can use the vector BSWAP instruction
+ // xxbrd to speed up scalar BSWAP64.
// CTPOP or CTTZ were introduced in P8/P9 respectively
setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
- setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
if (Subtarget.isISA3_0()) {
+ setOperationAction(ISD::BSWAP, MVT::i64 , Custom);
setOperationAction(ISD::CTTZ , MVT::i32 , Legal);
setOperationAction(ISD::CTTZ , MVT::i64 , Legal);
} else {
+ setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
setOperationAction(ISD::CTTZ , MVT::i32 , Expand);
setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
}
@@ -781,6 +783,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SRL, MVT::v1i128, Legal);
setOperationAction(ISD::SRA, MVT::v1i128, Expand);
}
+
+ if (Subtarget.hasP9Altivec()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+ }
}
if (Subtarget.hasQPX()) {
@@ -7888,6 +7895,107 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
return DAG.getNode(ISD::BITCAST, dl, VT, T);
}
+/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
+/// by the VINSERTB instruction introduced in ISA 3.0, else just return default
+/// SDValue.
+SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
+ SelectionDAG &DAG) const {
+ const unsigned BytesInVector = 16;
+ bool IsLE = Subtarget.isLittleEndian();
+ SDLoc dl(N);
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+ unsigned ShiftElts = 0, InsertAtByte = 0;
+ bool Swap = false;
+
+ // Shifts required to get the byte we want at element 7.
+ unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1,
+ 0, 15, 14, 13, 12, 11, 10, 9};
+ unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
+ 1, 2, 3, 4, 5, 6, 7, 8};
+
+ ArrayRef<int> Mask = N->getMask();
+ int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+ // For each mask element, find out if we're just inserting something
+ // from V2 into V1 or vice versa.
+ // Possible permutations inserting an element from V2 into V1:
+ // X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ // 0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ // ...
+ // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
+ // Inserting from V1 into V2 will be similar, except mask range will be
+ // [16,31].
+
+ bool FoundCandidate = false;
+ // If both vector operands for the shuffle are the same vector, the mask
+ // will contain only elements from the first one and the second one will be
+ // undef.
+ unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
+ // Go through the mask of bytes to find an element that's being moved
+ // from one vector to the other.
+ for (unsigned i = 0; i < BytesInVector; ++i) {
+ unsigned CurrentElement = Mask[i];
+ // If the 2nd operand is undefined, we should only look for the VINSERTB
+ // source element (VINSERTBSrcElem) in the Mask.
+ if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
+ continue;
+
+ bool OtherElementsInOrder = true;
+ // Examine the other elements in the Mask to see if they're in original
+ // order.
+ for (unsigned j = 0; j < BytesInVector; ++j) {
+ if (j == i)
+ continue;
+ // If CurrentElement is from V1 [0,15], then we expect the rest of the Mask
+ // to be from V2 [16,31] and vice versa, unless the 2nd operand is
+ // undefined, in which case we assume we're always picking from the 1st
+ // operand.
+ int MaskOffset =
+ (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
+ if (Mask[j] != OriginalOrder[j] + MaskOffset) {
+ OtherElementsInOrder = false;
+ break;
+ }
+ }
+ // If other elements are in original order, we record the number of shifts
+ // we need to get the element we want into element 7. Also record which byte
+ // in the vector we should insert into.
+ if (OtherElementsInOrder) {
+ // If 2nd operand is undefined, we assume no shifts and no swapping.
+ if (V2.isUndef()) {
+ ShiftElts = 0;
+ Swap = false;
+ } else {
+ // Only the low 4 bits are needed for the shift because the operands will
+ // be swapped if CurrentElement is >= 2^4.
+ ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
+ : BigEndianShifts[CurrentElement & 0xF];
+ Swap = CurrentElement < BytesInVector;
+ }
+ InsertAtByte = IsLE ? BytesInVector - (i + 1) : i;
+ FoundCandidate = true;
+ break;
+ }
+ }
+
+ if (!FoundCandidate)
+ return SDValue();
+
+ // Candidate found, construct the proper SDAG sequence with VINSERTB,
+ // optionally with VECSHL if shift is required.
+ if (Swap)
+ std::swap(V1, V2);
+ if (V2.isUndef())
+ V2 = V1;
+ if (ShiftElts) {
+ SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
+ DAG.getConstant(ShiftElts, dl, MVT::i32));
+ return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+ }
+ return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+}
+
/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
/// by the VINSERTH instruction introduced in ISA 3.0, else just return default
/// SDValue.
@@ -8035,8 +8143,11 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
}
if (Subtarget.hasP9Altivec()) {
- SDValue NewISDNode = lowerToVINSERTH(SVOp, DAG);
- if (NewISDNode)
+ SDValue NewISDNode;
+ if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
+ return NewISDNode;
+
+ if ((NewISDNode = lowerToVINSERTB(SVOp, DAG)))
return NewISDNode;
}
@@ -8675,6 +8786,23 @@ SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
+// Lower scalar BSWAP64 to xxbrd.
+SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ // MTVSRDD
+ Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0),
+ Op.getOperand(0));
+ // XXBRD
+ Op = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Op);
+ // MFVSRD
+ int VectorIndex = 0;
+ if (Subtarget.isLittleEndian())
+ VectorIndex = 1;
+ Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
+ DAG.getTargetConstant(VectorIndex, dl, MVT::i32));
+ return Op;
+}
+
SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -8719,11 +8847,29 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
"Should only be called for ISD::INSERT_VECTOR_ELT");
+
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));
// We have legal lowering for constant indices but not for variable ones.
- if (C)
- return Op;
- return SDValue();
+ if (!C)
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
+ if (VT == MVT::v8i16 || VT == MVT::v16i8) {
+ SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2);
+ unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8;
+ unsigned InsertAtElement = C->getZExtValue();
+ unsigned InsertAtByte = InsertAtElement * BytesInEachElement;
+ if (Subtarget.isLittleEndian()) {
+ InsertAtByte = (16 - BytesInEachElement) - InsertAtByte;
+ }
+ return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+ }
+ return Op;
}
SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
@@ -9146,6 +9292,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SREM:
case ISD::UREM:
return LowerREM(Op, DAG);
+ case ISD::BSWAP:
+ return LowerBSWAP(Op, DAG);
}
}
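
The byte index fed to PPCISD::VECINSERT in LowerINSERT_VECTOR_ELT above depends on endianness; a minimal sketch of that computation, assuming the 16-byte Altivec register layout (the helper name is hypothetical):

  // Byte offset at which VECINSERT places the new element.
  static unsigned insertAtByte(bool IsLittleEndian, unsigned EltSizeInBytes,
                               unsigned EltIndex) {
    unsigned Byte = EltIndex * EltSizeInBytes;
    return IsLittleEndian ? (16 - EltSizeInBytes) - Byte : Byte;
  }
  // v8i16, element 3:  big-endian -> 6,  little-endian -> 8
  // v16i8, element 5:  big-endian -> 5,  little-endian -> 10
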
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 1a5efeba4cf..bf9c4b8e63b 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -953,6 +953,7 @@ namespace llvm {
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
@@ -1079,6 +1080,11 @@ namespace llvm {
/// from one vector into the other.
SDValue lowerToVINSERTH(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
+ /// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be
+ /// handled by the VINSERTB instruction introduced in ISA 3.0. This is
+ /// essentially v16i8 vector version of VINSERTH.
+ SDValue lowerToVINSERTB(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
+
}; // end class PPCTargetLowering
namespace PPC {
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index 506fac7dfc1..e751c149b0b 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -1312,7 +1312,12 @@ def VEXTUWLX : VX1_RT5_RA5_VB5<1677, "vextuwlx", []>;
def VEXTUWRX : VX1_RT5_RA5_VB5<1933, "vextuwrx", []>;
// Vector Insert Element Instructions
-def VINSERTB : VX1_VT5_UIM5_VB5<781, "vinsertb", []>;
+def VINSERTB : VXForm_1<781, (outs vrrc:$vD),
+ (ins vrrc:$vDi, u4imm:$UIM, vrrc:$vB),
+ "vinsertb $vD, $vB, $UIM", IIC_VecGeneral,
+ [(set v16i8:$vD, (PPCvecinsert v16i8:$vDi, v16i8:$vB,
+ imm32SExt16:$UIM))]>,
+ RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
def VINSERTH : VXForm_1<845, (outs vrrc:$vD),
(ins vrrc:$vDi, u4imm:$UIM, vrrc:$vB),
"vinserth $vD, $vB, $UIM", IIC_VecGeneral,
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index ab86a54f6fe..565392f76e4 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -16,7 +16,7 @@
#include "PPC.h"
#include "PPCRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "PPCGenInstrInfo.inc"
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 1fc50d2c860..3261bc9bc53 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -2595,6 +2595,13 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
(f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>;
}
+ // Alternate patterns for PPCmtvsrz where the output is v8i16 or v16i8 instead
+ // of f64
+ def : Pat<(v8i16 (PPCmtvsrz i32:$A)),
+ (v8i16 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>;
+ def : Pat<(v16i8 (PPCmtvsrz i32:$A)),
+ (v16i8 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>;
+
// Patterns for which instructions from ISA 3.0 are a better match
let Predicates = [IsLittleEndian, HasP9Vector] in {
def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index d46c1383297..78467e81795 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -28,6 +28,8 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -37,8 +39,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cstdlib>
diff --git a/lib/Target/RISCV/RISCV.h b/lib/Target/RISCV/RISCV.h
index 1b6140203c8..884cb2e5014 100644
--- a/lib/Target/RISCV/RISCV.h
+++ b/lib/Target/RISCV/RISCV.h
@@ -15,15 +15,21 @@
#ifndef LLVM_LIB_TARGET_RISCV_RISCV_H
#define LLVM_LIB_TARGET_RISCV_RISCV_H
-#include "MCTargetDesc/RISCVMCTargetDesc.h"
-#include "llvm/Target/TargetMachine.h"
+#include "MCTargetDesc/RISCVBaseInfo.h"
namespace llvm {
class RISCVTargetMachine;
+class AsmPrinter;
+class FunctionPass;
class MCInst;
+class MCOperand;
class MachineInstr;
+class MachineOperand;
-void LowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI);
+void LowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+ const AsmPrinter &AP);
+bool LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
+ MCOperand &MCOp, const AsmPrinter &AP);
FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM);
}
diff --git a/lib/Target/RISCV/RISCV.td b/lib/Target/RISCV/RISCV.td
index 54aa570e13b..da919acad36 100644
--- a/lib/Target/RISCV/RISCV.td
+++ b/lib/Target/RISCV/RISCV.td
@@ -40,9 +40,7 @@ def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit]>;
//===----------------------------------------------------------------------===//
def RISCVInstrInfo : InstrInfo {
- // TODO: disable guessInstructionProperties when
- // https://reviews.llvm.org/D37065 lands.
- let guessInstructionProperties = 1;
+ let guessInstructionProperties = 0;
}
def RISCVAsmParser : AsmParser {
diff --git a/lib/Target/RISCV/RISCVAsmPrinter.cpp b/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 1c213b6c7e9..4808e6c73c5 100644
--- a/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -43,6 +43,11 @@ public:
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
const MachineInstr *MI);
+
+ // Wrapper needed for tblgenned pseudo lowering.
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
+ return LowerRISCVMachineOperandToMCOperand(MO, MCOp, *this);
+ }
};
}
@@ -56,7 +61,7 @@ void RISCVAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
MCInst TmpInst;
- LowerRISCVMachineInstrToMCInst(MI, TmpInst);
+ LowerRISCVMachineInstrToMCInst(MI, TmpInst, *this);
EmitToStreamer(*OutStreamer, TmpInst);
}
diff --git a/lib/Target/RISCV/RISCVCallingConv.td b/lib/Target/RISCV/RISCVCallingConv.td
index e0c25e32e01..0b7a523424c 100644
--- a/lib/Target/RISCV/RISCVCallingConv.td
+++ b/lib/Target/RISCV/RISCVCallingConv.td
@@ -27,3 +27,6 @@ def CC_RISCV32 : CallingConv<[
]>;
def CSR : CalleeSavedRegs<(add X1, X3, X4, X8, X9, (sequence "X%u", 18, 27))>;
+
+// Needed for implementation of RISCVRegisterInfo::getNoPreservedMask()
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
diff --git a/lib/Target/RISCV/RISCVFrameLowering.h b/lib/Target/RISCV/RISCVFrameLowering.h
index 14772ddac4a..0b2c7a40298 100644
--- a/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/lib/Target/RISCV/RISCVFrameLowering.h
@@ -14,7 +14,7 @@
#ifndef LLVM_LIB_TARGET_RISCV_RISCVFRAMELOWERING_H
#define LLVM_LIB_TARGET_RISCV_RISCVFRAMELOWERING_H
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
class RISCVSubtarget;
@@ -30,6 +30,12 @@ public:
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
bool hasFP(const MachineFunction &MF) const override;
+
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override {
+ return MBB.erase(MI);
+ }
};
}
#endif
diff --git a/lib/Target/RISCV/RISCVISelLowering.cpp b/lib/Target/RISCV/RISCVISelLowering.cpp
index d76170b7b78..98f7aa16e2e 100644
--- a/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -49,8 +49,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setStackPointerRegisterToSaveRestore(RISCV::X2);
+ for (auto N : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD})
+ setLoadExtAction(N, XLenVT, MVT::i1, Promote);
+
// TODO: add all necessary setOperationAction calls.
+ setOperationAction(ISD::GlobalAddress, XLenVT, Custom);
+ setOperationAction(ISD::BR_CC, XLenVT, Expand);
setBooleanContents(ZeroOrOneBooleanContent);
// Function alignments (log2).
@@ -63,6 +68,30 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
switch (Op.getOpcode()) {
default:
report_fatal_error("unimplemented operand");
+ case ISD::GlobalAddress:
+ return lowerGlobalAddress(Op, DAG);
+ }
+}
+
+SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT Ty = Op.getValueType();
+ GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = N->getGlobal();
+ int64_t Offset = N->getOffset();
+
+ if (!isPositionIndependent() && !Subtarget.is64Bit()) {
+ SDValue GAHi =
+ DAG.getTargetGlobalAddress(GV, DL, Ty, Offset, RISCVII::MO_HI);
+ SDValue GALo =
+ DAG.getTargetGlobalAddress(GV, DL, Ty, Offset, RISCVII::MO_LO);
+ SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, GAHi), 0);
+ SDValue MNLo =
+ SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, GALo), 0);
+ return MNLo;
+ } else {
+ report_fatal_error("Unable to lowerGlobalAddress");
}
}
@@ -79,6 +108,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
default:
report_fatal_error("Unsupported calling convention");
case CallingConv::C:
+ case CallingConv::Fast:
break;
}
@@ -115,6 +145,135 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
return Chain;
}
+// Lower a call to a callseq_start + CALL + callseq_end chain, and add input
+// and output parameter nodes.
+SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &DL = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ CLI.IsTailCall = false;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ if (IsVarArg) {
+ report_fatal_error("LowerCall with varargs not implemented");
+ }
+
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // Analyze the operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+ ArgCCInfo.AnalyzeCallOperands(Outs, CC_RISCV32);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = ArgCCInfo.getNextStackOffset();
+
+ for (auto &Arg : Outs) {
+ if (!Arg.Flags.isByVal())
+ continue;
+ report_fatal_error("Passing arguments byval not yet implemented");
+ }
+
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
+
+ // Copy argument values to their designated locations.
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SDValue StackPtr;
+ for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+ CCValAssign &VA = ArgLocs[I];
+ SDValue ArgValue = OutVals[I];
+
+ // Promote the value if needed.
+ // For now, only handle fully promoted arguments.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
+ if (VA.isRegLoc()) {
+ // Queue up the argument copies and emit them at the end.
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
+ } else {
+ assert(VA.isMemLoc() && "Argument not register or memory");
+ report_fatal_error("Passing arguments via the stack not yet implemented");
+ }
+ }
+
+ SDValue Glue;
+
+ // Build a sequence of copy-to-reg nodes, chained and glued together.
+ for (auto &Reg : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, Glue);
+ Glue = Chain.getValue(1);
+ }
+
+ if (isa<GlobalAddressSDNode>(Callee)) {
+ Callee = lowerGlobalAddress(Callee, DAG);
+ } else if (isa<ExternalSymbolSDNode>(Callee)) {
+ report_fatal_error(
+ "lowerExternalSymbol, needed for lowerCall, not yet handled");
+ }
+
+ // The first call operand is the chain and the second is the target address.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (auto &Reg : RegsToPass)
+ Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ // Glue the call to the argument copies, if any.
+ if (Glue.getNode())
+ Ops.push_back(Glue);
+
+ // Emit the call.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ Chain = DAG.getNode(RISCVISD::CALL, DL, NodeTys, Ops);
+ Glue = Chain.getValue(1);
+
+ // Mark the end of the call, which is glued to the call itself.
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getConstant(NumBytes, DL, PtrVT, true),
+ DAG.getConstant(0, DL, PtrVT, true),
+ Glue, DL);
+ Glue = Chain.getValue(1);
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState RetCCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
+ RetCCInfo.AnalyzeCallResult(Ins, RetCC_RISCV32);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (auto &VA : RVLocs) {
+ // Copy the value out, gluing the copy to the end of the call sequence.
+ SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
+ VA.getLocVT(), Glue);
+ Chain = RetValue.getValue(1);
+ Glue = RetValue.getValue(2);
+
+ InVals.push_back(Chain.getValue(0));
+ }
+
+ return Chain;
+}
+
SDValue
RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
@@ -165,6 +324,8 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
break;
case RISCVISD::RET_FLAG:
return "RISCVISD::RET_FLAG";
+ case RISCVISD::CALL:
+ return "RISCVISD::CALL";
}
return nullptr;
}
diff --git a/lib/Target/RISCV/RISCVISelLowering.h b/lib/Target/RISCV/RISCVISelLowering.h
index 9fed48fc04e..dfb4824cc18 100644
--- a/lib/Target/RISCV/RISCVISelLowering.h
+++ b/lib/Target/RISCV/RISCVISelLowering.h
@@ -24,7 +24,8 @@ class RISCVSubtarget;
namespace RISCVISD {
enum NodeType : unsigned {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
- RET_FLAG
+ RET_FLAG,
+ CALL
};
}
@@ -52,10 +53,13 @@ private:
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override {
return true;
}
+ SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
};
}
diff --git a/lib/Target/RISCV/RISCVInstrInfo.cpp b/lib/Target/RISCV/RISCVInstrInfo.cpp
index 92db5358ce4..5b4f4fcbb88 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -28,4 +28,50 @@
using namespace llvm;
-RISCVInstrInfo::RISCVInstrInfo() : RISCVGenInstrInfo() {}
+RISCVInstrInfo::RISCVInstrInfo()
+ : RISCVGenInstrInfo(RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP) {}
+
+void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned DstReg,
+ unsigned SrcReg, bool KillSrc) const {
+ assert(RISCV::GPRRegClass.contains(DstReg, SrcReg) &&
+ "Impossible reg-to-reg copy");
+
+ BuildMI(MBB, MBBI, DL, get(RISCV::ADDI), DstReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0);
+}
+
+void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned SrcReg, bool IsKill, int FI,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL;
+ if (I != MBB.end())
+ DL = I->getDebugLoc();
+
+ if (RC == &RISCV::GPRRegClass)
+ BuildMI(MBB, I, DL, get(RISCV::SW))
+ .addReg(SrcReg, getKillRegState(IsKill))
+ .addFrameIndex(FI)
+ .addImm(0);
+ else
+ llvm_unreachable("Can't store this register to stack slot");
+}
+
+void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DstReg, int FI,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL;
+ if (I != MBB.end())
+ DL = I->getDebugLoc();
+
+ if (RC == &RISCV::GPRRegClass)
+ BuildMI(MBB, I, DL, get(RISCV::LW), DstReg).addFrameIndex(FI).addImm(0);
+ else
+ llvm_unreachable("Can't load this register from stack slot");
+}
diff --git a/lib/Target/RISCV/RISCVInstrInfo.h b/lib/Target/RISCV/RISCVInstrInfo.h
index 50404d5554d..05c8378445c 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/lib/Target/RISCV/RISCVInstrInfo.h
@@ -15,7 +15,7 @@
#define LLVM_LIB_TARGET_RISCV_RISCVINSTRINFO_H
#include "RISCVRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "RISCVGenInstrInfo.inc"
@@ -26,7 +26,21 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
public:
RISCVInstrInfo();
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned DstReg, unsigned SrcReg,
+ bool KillSrc) const override;
+
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+ bool IsKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, unsigned DstReg,
+ int FrameIndex, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
};
}
-
#endif
diff --git a/lib/Target/RISCV/RISCVInstrInfo.td b/lib/Target/RISCV/RISCVInstrInfo.td
index 23adf1eda9d..23f218fda8f 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/lib/Target/RISCV/RISCVInstrInfo.td
@@ -17,8 +17,22 @@ include "RISCVInstrFormats.td"
// RISC-V specific DAG Nodes.
//===----------------------------------------------------------------------===//
-def RetFlag : SDNode<"RISCVISD::RET_FLAG", SDTNone,
- [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def SDT_RISCVCall : SDTypeProfile<0, -1, [SDTCisVT<0, XLenVT>]>;
+def SDT_RISCVCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+def SDT_RISCVCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+
+
+def Call : SDNode<"RISCVISD::CALL", SDT_RISCVCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def CallSeqStart : SDNode<"ISD::CALLSEQ_START", SDT_RISCVCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def CallSeqEnd : SDNode<"ISD::CALLSEQ_END", SDT_RISCVCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def RetFlag : SDNode<"RISCVISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
//===----------------------------------------------------------------------===//
// Operand and SDNode transformation definitions.
@@ -67,7 +81,7 @@ def uimm12 : Operand<XLenVT> {
}
// A 13-bit signed immediate where the least significant bit is zero.
-def simm13_lsb0 : Operand<XLenVT> {
+def simm13_lsb0 : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<13, "Lsb0">;
let EncoderMethod = "getImmOpValueAsr1";
let DecoderMethod = "decodeSImmOperandAndLsl1<13>";
@@ -80,12 +94,30 @@ def uimm20 : Operand<XLenVT> {
}
// A 21-bit signed immediate where the least significant bit is zero.
-def simm21_lsb0 : Operand<XLenVT> {
+def simm21_lsb0 : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<21, "Lsb0">;
let EncoderMethod = "getImmOpValueAsr1";
let DecoderMethod = "decodeSImmOperandAndLsl1<21>";
}
+// Standalone (codegen-only) immleaf patterns.
+def simm32 : ImmLeaf<XLenVT, [{return isInt<32>(Imm);}]>;
+
+// Extract least significant 12 bits from an immediate value and sign extend
+// them.
+def LO12Sext : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(SignExtend64<12>(N->getZExtValue()),
+ SDLoc(N), N->getValueType(0));
+}]>;
+
+// Extract the most significant 20 bits from an immediate value. Add 1 if bit
+// 11 is 1, to compensate for the low 12 bits in the matching immediate addi
+// or ld/st being negative.
+def HI20 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(((N->getZExtValue()+0x800) >> 12) & 0xfffff,
+ SDLoc(N), N->getValueType(0));
+}]>;
+
//===----------------------------------------------------------------------===//
// Instruction Class Templates
//===----------------------------------------------------------------------===//
@@ -257,6 +289,12 @@ class PatGprUimm5<SDPatternOperator OpNode, RVInstIShift Inst>
: Pat<(OpNode GPR:$rs1, uimm5:$shamt),
(Inst GPR:$rs1, uimm5:$shamt)>;
+/// Immediates
+
+def : Pat<(simm12:$imm), (ADDI X0, simm12:$imm)>;
+// TODO: Add a pattern for immediates with all zeroes in the lower 12 bits.
+def : Pat<(simm32:$imm), (ADDI (LUI (HI20 imm:$imm)), (LO12Sext imm:$imm))>;
+
/// Simple arithmetic operations
def : PatGprGpr<add, ADD>;
@@ -284,6 +322,80 @@ def : PatGprSimm12<setult, SLTIU>;
/// Branches and jumps
+// Match `(brcond (CondOp ..), ..)` and lower to the appropriate RISC-V branch
+// instruction.
+class BccPat<PatFrag CondOp, RVInstB Inst>
+ : Pat<(brcond (i32 (CondOp GPR:$rs1, GPR:$rs2)), bb:$imm12),
+ (Inst GPR:$rs1, GPR:$rs2, simm13_lsb0:$imm12)>;
+
+def : BccPat<seteq, BEQ>;
+def : BccPat<setne, BNE>;
+def : BccPat<setlt, BLT>;
+def : BccPat<setge, BGE>;
+def : BccPat<setult, BLTU>;
+def : BccPat<setuge, BGEU>;
+
+class BccSwapPat<PatFrag CondOp, RVInst InstBcc>
+ : Pat<(brcond (i32 (CondOp GPR:$rs1, GPR:$rs2)), bb:$imm12),
+ (InstBcc GPR:$rs2, GPR:$rs1, bb:$imm12)>;
+
+// Condition codes that don't have matching RISC-V branch instructions, but
+// are trivially supported by swapping the two input operands
+def : BccSwapPat<setgt, BLT>;
+def : BccSwapPat<setle, BGE>;
+def : BccSwapPat<setugt, BLTU>;
+def : BccSwapPat<setule, BGEU>;
+
+// An extra pattern is needed for a brcond without a setcc (i.e. where the
+// condition was calculated elsewhere).
+def : Pat<(brcond GPR:$cond, bb:$imm12), (BNE GPR:$cond, X0, bb:$imm12)>;
+
+let isBarrier = 1, isBranch = 1, isTerminator = 1 in
+def PseudoBR : Pseudo<(outs), (ins simm21_lsb0:$imm20), [(br bb:$imm20)]>,
+ PseudoInstExpansion<(JAL X0, simm21_lsb0:$imm20)>;
+
+let isCall = 1, Defs=[X1] in
+def PseudoCALL : Pseudo<(outs), (ins GPR:$rs1), [(Call GPR:$rs1)]>,
+ PseudoInstExpansion<(JALR X1, GPR:$rs1, 0)>;
+
let isBarrier = 1, isReturn = 1, isTerminator = 1 in
def PseudoRET : Pseudo<(outs), (ins), [(RetFlag)]>,
PseudoInstExpansion<(JALR X0, X1, 0)>;
+
+/// Loads
+
+multiclass LdPat<PatFrag LoadOp, RVInst Inst> {
+ def : Pat<(LoadOp GPR:$rs1), (Inst GPR:$rs1, 0)>;
+ def : Pat<(LoadOp (add GPR:$rs1, simm12:$imm12)),
+ (Inst GPR:$rs1, simm12:$imm12)>;
+}
+
+defm : LdPat<sextloadi8, LB>;
+defm : LdPat<extloadi8, LB>;
+defm : LdPat<sextloadi16, LH>;
+defm : LdPat<extloadi16, LH>;
+defm : LdPat<load, LW>;
+defm : LdPat<zextloadi8, LBU>;
+defm : LdPat<zextloadi16, LHU>;
+
+/// Stores
+
+multiclass StPat<PatFrag StoreOp, RVInst Inst> {
+ def : Pat<(StoreOp GPR:$rs2, GPR:$rs1), (Inst GPR:$rs2, GPR:$rs1, 0)>;
+ def : Pat<(StoreOp GPR:$rs2, (add GPR:$rs1, simm12:$imm12)),
+ (Inst GPR:$rs2, GPR:$rs1, simm12:$imm12)>;
+}
+
+defm : StPat<truncstorei8, SB>;
+defm : StPat<truncstorei16, SH>;
+defm : StPat<store, SW>;
+
+/// Other pseudo-instructions
+
+// Pessimistically assume the stack pointer will be clobbered
+let Defs = [X2], Uses = [X2] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(CallSeqStart timm:$amt1, timm:$amt2)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(CallSeqEnd timm:$amt1, timm:$amt2)]>;
+} // Defs = [X2], Uses = [X2]
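
The HI20/LO12Sext transforms introduced above split a 32-bit immediate so that a LUI+ADDI pair rebuilds it exactly; the +0x800 adjustment compensates for ADDI sign-extending its 12-bit operand. A minimal standalone C++ sketch of that arithmetic (hypothetical helper names, not part of the patch):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Mirrors the HI20 and LO12Sext SDNodeXForms: hi20 feeds LUI, and ADDI then
    // adds the sign-extended low 12 bits, reproducing the original value.
    static uint32_t hi20(int32_t imm) { return (((uint32_t)imm + 0x800u) >> 12) & 0xfffffu; }
    static int32_t lo12sext(int32_t imm) { return (int32_t)((uint32_t)imm << 20) >> 20; }

    int main() {
      for (int32_t imm : {0, 1, -1, 2047, 2048, -2048, 0x12345678, -2147483647}) {
        // LUI places hi20 << 12; ADDI adds the sign-extended low 12 bits
        // (done in uint32_t so the wraparound is well defined).
        int32_t rebuilt = (int32_t)((hi20(imm) << 12) + (uint32_t)lo12sext(imm));
        assert(rebuilt == imm);
        printf("imm=%d -> hi20=0x%05x lo12=%d\n", imm, hi20(imm), lo12sext(imm));
      }
      return 0;
    }
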
diff --git a/lib/Target/RISCV/RISCVMCInstLower.cpp b/lib/Target/RISCV/RISCVMCInstLower.cpp
index 1ac8d982ff9..ef0051ed56e 100644
--- a/lib/Target/RISCV/RISCVMCInstLower.cpp
+++ b/lib/Target/RISCV/RISCVMCInstLower.cpp
@@ -13,6 +13,8 @@
//===----------------------------------------------------------------------===//
#include "RISCV.h"
+#include "MCTargetDesc/RISCVMCExpr.h"
+#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -24,27 +26,72 @@
using namespace llvm;
-void llvm::LowerRISCVMachineInstrToMCInst(const MachineInstr *MI,
- MCInst &OutMI) {
+static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
+ const AsmPrinter &AP) {
+ MCContext &Ctx = AP.OutContext;
+ RISCVMCExpr::VariantKind Kind;
+
+ switch (MO.getTargetFlags()) {
+ default:
+ llvm_unreachable("Unknown target flag on GV operand");
+ case RISCVII::MO_None:
+ Kind = RISCVMCExpr::VK_RISCV_None;
+ break;
+ case RISCVII::MO_LO:
+ Kind = RISCVMCExpr::VK_RISCV_LO;
+ break;
+ case RISCVII::MO_HI:
+ Kind = RISCVMCExpr::VK_RISCV_HI;
+ break;
+ }
+
+ const MCExpr *ME =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx);
+
+ if (!MO.isJTI() && MO.getOffset())
+ ME = MCBinaryExpr::createAdd(
+ ME, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+
+ ME = RISCVMCExpr::create(ME, Kind, Ctx);
+ return MCOperand::createExpr(ME);
+}
+
+bool llvm::LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
+ MCOperand &MCOp,
+ const AsmPrinter &AP) {
+ switch (MO.getType()) {
+ default:
+ report_fatal_error("LowerRISCVMachineInstrToMCInst: unknown operand type");
+ case MachineOperand::MO_Register:
+ // Ignore all implicit register operands.
+ if (MO.isImplicit())
+ return false;
+ MCOp = MCOperand::createReg(MO.getReg());
+ break;
+ case MachineOperand::MO_RegisterMask:
+ // Regmasks are like implicit defs.
+ return false;
+ case MachineOperand::MO_Immediate:
+ MCOp = MCOperand::createImm(MO.getImm());
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ MCOp = MCOperand::createExpr(
+ MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), AP.OutContext));
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ MCOp = lowerSymbolOperand(MO, AP.getSymbol(MO.getGlobal()), AP);
+ break;
+ }
+ return true;
+}
+
+void llvm::LowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+ const AsmPrinter &AP) {
OutMI.setOpcode(MI->getOpcode());
for (const MachineOperand &MO : MI->operands()) {
MCOperand MCOp;
- switch (MO.getType()) {
- default:
- report_fatal_error(
- "LowerRISCVMachineInstrToMCInst: unknown operand type");
- case MachineOperand::MO_Register:
- // Ignore all implicit register operands.
- if (MO.isImplicit())
- continue;
- MCOp = MCOperand::createReg(MO.getReg());
- break;
- case MachineOperand::MO_Immediate:
- MCOp = MCOperand::createImm(MO.getImm());
- break;
- }
-
- OutMI.addOperand(MCOp);
+ if (LowerRISCVMachineOperandToMCOperand(MO, MCOp, AP))
+ OutMI.addOperand(MCOp);
}
}
diff --git a/lib/Target/RISCV/RISCVRegisterInfo.cpp b/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 4f6c528061c..cd658d7e2d9 100644
--- a/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -18,9 +18,9 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#define GET_REGINFO_TARGET_DESC
#include "RISCVGenRegisterInfo.inc"
@@ -50,12 +50,47 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
+const uint32_t *RISCVRegisterInfo::getNoPreservedMask() const {
+ return CSR_NoRegs_RegMask;
+}
+
void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
- report_fatal_error("Subroutines not supported yet");
+ // TODO: this implementation is a temporary placeholder which does just
+ // enough to allow other aspects of code generation to be tested
+
+ assert(SPAdj == 0 && "Unexpected non-zero SPAdj value");
+
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ DebugLoc DL = MI.getDebugLoc();
+
+ unsigned FrameReg = getFrameRegister(MF);
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+ int Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg);
+ Offset += MI.getOperand(FIOperandNum + 1).getImm();
+
+ assert(TFI->hasFP(MF) && "eliminateFrameIndex currently requires hasFP");
+
+ // Offsets must be directly encoded in a 12-bit immediate field
+ if (!isInt<12>(Offset)) {
+ report_fatal_error(
+ "Frame offsets outside of the signed 12-bit range not supported");
+ }
+
+ MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false);
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ return;
}
unsigned RISCVRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return RISCV::X8;
}
+
+const uint32_t *
+RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & /*MF*/,
+ CallingConv::ID /*CC*/) const {
+ return CSR_RegMask;
+}
diff --git a/lib/Target/RISCV/RISCVRegisterInfo.h b/lib/Target/RISCV/RISCVRegisterInfo.h
index 94af9f44ecd..d9de9bf8c76 100644
--- a/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -25,10 +25,15 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
RISCVRegisterInfo(unsigned HwMode);
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
+
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ const uint32_t *getNoPreservedMask() const override;
+
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index df819ccd15d..6948b72747c 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -19,8 +19,8 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h
index ac0e69ccde1..6098afa6898 100644
--- a/lib/Target/Sparc/SparcFrameLowering.h
+++ b/lib/Target/Sparc/SparcFrameLowering.h
@@ -15,7 +15,7 @@
#define LLVM_LIB_TARGET_SPARC_SPARCFRAMELOWERING_H
#include "Sparc.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h
index c053cc4c475..524b5d05416 100644
--- a/lib/Target/Sparc/SparcInstrInfo.h
+++ b/lib/Target/Sparc/SparcInstrInfo.h
@@ -15,7 +15,7 @@
#define LLVM_LIB_TARGET_SPARC_SPARCINSTRINFO_H
#include "SparcRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "SparcGenInstrInfo.inc"
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index 37a1fdf4d77..b9647eaa3d5 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -20,10 +20,10 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
index bfbdb8d0b44..ad6b55a9fc9 100644
--- a/lib/Target/Sparc/SparcSubtarget.h
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -19,7 +19,7 @@
#include "SparcInstrInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h
index 91c5a5d53a1..a75d111b029 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -11,7 +11,7 @@
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZFRAMELOWERING_H
#include "llvm/ADT/IndexedMap.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
class SystemZTargetMachine;
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 4533f4fdf21..19ce7776fed 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -27,12 +27,12 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cassert>
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index b8be1f5f392..216139eb7c7 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -20,7 +20,7 @@
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include <cstdint>
#define GET_INSTRINFO_HEADER
diff --git a/lib/Target/SystemZ/SystemZLDCleanup.cpp b/lib/Target/SystemZ/SystemZLDCleanup.cpp
index d4cd89ce590..27c72b41e4f 100644
--- a/lib/Target/SystemZ/SystemZLDCleanup.cpp
+++ b/lib/Target/SystemZ/SystemZLDCleanup.cpp
@@ -19,7 +19,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 05f93ce5162..a44fae523fe 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -13,7 +13,7 @@
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
using namespace llvm;
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index a4d9421e08a..6d50369e587 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -323,6 +323,11 @@ unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) const {
return 0;
}
+bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
+ EVT VT = TLI->getValueType(DL, DataType);
+ return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
+}
+
int SystemZTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 28821a2ca11..4b11a6f0a83 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -62,6 +62,7 @@ public:
unsigned getPrefetchDistance() { return 2000; }
unsigned getMinPrefetchStride() { return 2048; }
+ bool hasDivRemOp(Type *DataType, bool IsSigned);
bool prefersVectorizedAddressing() { return false; }
bool LSRWithInstrQueries() { return true; }
bool supportsEfficientVectorElementLoadStore() { return true; }
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 5dcb89477a3..b24888fa9cb 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -167,6 +167,13 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
if (GV && !GV->isDeclarationForLinker())
return true;
+ // A symbol marked nonlazybind should not be accessed with a plt. If the
+ // symbol turns out to be external, the linker will convert a direct
+ // access to an access via the plt, so don't assume it is local.
+ const Function *F = dyn_cast_or_null<Function>(GV);
+ if (F && F->hasFnAttribute(Attribute::NonLazyBind))
+ return false;
+
bool IsTLS = GV && GV->isThreadLocal();
bool IsAccessViaCopyRelocs =
Options.MCOptions.MCPIECopyRelocations && GV && isa<GlobalVariable>(GV);
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 211358ad66c..ee60c8f3a7a 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -267,12 +267,11 @@ bool WebAssemblyAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
if (AsmVariant != 0)
report_fatal_error("There are no defined alternate asm variants");
- if (!ExtraCode) {
- // TODO: For now, we just hard-code 0 as the constant offset; teach
- // SelectInlineAsmMemoryOperand how to do address mode matching.
- OS << "0(" + regToString(MI->getOperand(OpNo)) + ')';
- return false;
- }
+ // The current approach to inline asm is that "r" constraints are expressed
+ // as local indices, rather than values on the operand stack. This simplifies
+ // using "r" as it eliminates the need to push and pop the values in a
+ // particular order; however, it also makes it impossible to have an "m"
+ // constraint. So we don't support it.
return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, AsmVariant, ExtraCode, OS);
}
diff --git a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index 41249117ae0..e2edb924d4d 100644
--- a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -294,6 +294,17 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
unsigned OldReg = MO.getReg();
+ // Inline asm may have a def in the middle of the operands. Our contract
+ // with inline asm register operands is to provide local indices as
+ // immediates.
+ if (MO.isDef()) {
+ assert(MI.getOpcode() == TargetOpcode::INLINEASM);
+ unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+ MRI.removeRegOperandFromUseList(&MO);
+ MO = MachineOperand::CreateImm(LocalId);
+ continue;
+ }
+
// If we see a stackified register, prepare to insert subsequent
// get_locals before the start of its tree.
if (MFI.isVRegStackified(OldReg)) {
@@ -301,6 +312,15 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
continue;
}
+ // Our contract with inline asm register operands is to provide local
+ // indices as immediates.
+ if (MI.getOpcode() == TargetOpcode::INLINEASM) {
+ unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+ MRI.removeRegOperandFromUseList(&MO);
+ MO = MachineOperand::CreateImm(LocalId);
+ continue;
+ }
+
// Insert a get_local.
unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index f516a6b260d..e67b1c88b58 100644
--- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -541,7 +541,7 @@ unsigned WebAssemblyFastISel::getRegForUnsignedValue(const Value *V) {
unsigned WebAssemblyFastISel::getRegForSignedValue(const Value *V) {
MVT::SimpleValueType From = getSimpleType(V->getType());
MVT::SimpleValueType To = getLegalType(From);
- return zeroExtend(getRegForValue(V), V, From, To);
+ return signExtend(getRegForValue(V), V, From, To);
}
unsigned WebAssemblyFastISel::getRegForPromotedValue(const Value *V,
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
index bf326fce88f..4cc7f5ae058 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
@@ -16,7 +16,7 @@
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYFRAMELOWERING_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYFRAMELOWERING_H
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
class MachineFrameInfo;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
index df6c937a364..eb74106336e 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
@@ -17,7 +17,7 @@
#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYINSTRINFO_H
#include "WebAssemblyRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "WebAssemblyGenInstrInfo.inc"
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
index 9367464c806..5e7ebd19fac 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
@@ -24,7 +24,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index b8ea2f01133..2a784c9822a 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -2791,7 +2791,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
isParsingIntelSyntax())) {
default: llvm_unreachable("Unexpected match result!");
case Match_Success:
- if (validateInstruction(Inst, Operands))
+ if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
return true;
// Some instructions need post-processing to, for example, tweak which
// encoding is selected. Loop on it while changes happen so the
@@ -3082,7 +3082,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
// instruction will already have been filled in correctly, since the failing
// matches won't have modified it).
if (NumSuccessfulMatches == 1) {
- if (validateInstruction(Inst, Operands))
+ if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
return true;
// Some instructions need post-processing to, for example, tweak which
// encoding is selected. Loop on it while changes happen so the individual
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index 6ff1136cd85..0c99dbbe328 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -54,12 +54,12 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
if (TSFlags & X86II::LOCK)
OS << "\tlock\t";
if (!(TSFlags & X86II::LOCK) && Flags & X86::IP_HAS_LOCK)
- OS << "\tlock\n";
+ OS << "\tlock\t";
if (Flags & X86::IP_HAS_REPEAT_NE)
- OS << "\trepne\n";
+ OS << "\trepne\t";
else if (Flags & X86::IP_HAS_REPEAT)
- OS << "\trep\n";
+ OS << "\trep\t";
// Output CALLpcrel32 as "callq" in 64-bit mode.
// In Intel annotation it's always emitted as "call".
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 464941a1bab..1f02600a798 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -41,13 +41,13 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
uint64_t TSFlags = Desc.TSFlags;
if (TSFlags & X86II::LOCK)
- OS << "\tlock\n";
+ OS << "\tlock\t";
unsigned Flags = MI->getFlags();
if (Flags & X86::IP_HAS_REPEAT_NE)
- OS << "\trepne\n";
+ OS << "\trepne\t";
else if (Flags & X86::IP_HAS_REPEAT)
- OS << "\trep\n";
+ OS << "\trep\t";
printInstruction(MI, OS);
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index f4021d7639b..34f2956e0c0 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -116,9 +116,15 @@ def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
"Enable AVX2 instructions",
[FeatureAVX]>;
+def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
+ "Enable three-operand fused multiple-add",
+ [FeatureAVX]>;
+def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
+ "Support 16-bit floating point conversion instructions",
+ [FeatureAVX]>;
def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
"Enable AVX-512 instructions",
- [FeatureAVX2]>;
+ [FeatureAVX2, FeatureFMA, FeatureF16C]>;
def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
"Enable AVX-512 Exponential and Reciprocal Instructions",
[FeatureAVX512]>;
@@ -154,9 +160,6 @@ def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
"Enable packed carry-less multiplication instructions",
[FeatureSSE2]>;
-def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
- "Enable three-operand fused multiple-add",
- [FeatureAVX]>;
def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
"Enable four-operand fused multiple-add",
[FeatureAVX, FeatureSSE4A]>;
@@ -177,9 +180,6 @@ def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
"Support MOVBE instruction">;
def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
"Support RDRAND instruction">;
-def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
- "Support 16-bit floating point conversion instructions",
- [FeatureAVX]>;
def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
"Support FS/GS Base instructions">;
def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index 34e384ba311..d8b5b7d3fc0 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -34,13 +34,13 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <cassert>
#include <cstddef>
diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp
index 7beb9c6e357..54f937bcda3 100644
--- a/lib/Target/X86/X86CallLowering.cpp
+++ b/lib/Target/X86/X86CallLowering.cpp
@@ -34,6 +34,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
@@ -41,7 +42,6 @@
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/LowLevelTypeImpl.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cassert>
#include <cstdint>
diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp
index b2cd622b1e8..e75276960cc 100644
--- a/lib/Target/X86/X86CmovConversion.cpp
+++ b/lib/Target/X86/X86CmovConversion.cpp
@@ -57,6 +57,7 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCSchedule.h"
@@ -64,7 +65,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp
index 744510a3a3b..6dd4631a484 100644
--- a/lib/Target/X86/X86EvexToVex.cpp
+++ b/lib/Target/X86/X86EvexToVex.cpp
@@ -171,7 +171,7 @@ static void performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
case X86::VALIGNDZ128rri:
case X86::VALIGNDZ128rmi:
case X86::VALIGNQZ128rri:
- case X86::VALIGNQZ128rmi:
+ case X86::VALIGNQZ128rmi: {
assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&
"Unexpected new opcode!");
unsigned Scale = (Opc == X86::VALIGNQZ128rri ||
@@ -180,6 +180,24 @@ static void performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
Imm.setImm(Imm.getImm() * Scale);
break;
}
+ case X86::VSHUFF32X4Z256rmi:
+ case X86::VSHUFF32X4Z256rri:
+ case X86::VSHUFF64X2Z256rmi:
+ case X86::VSHUFF64X2Z256rri:
+ case X86::VSHUFI32X4Z256rmi:
+ case X86::VSHUFI32X4Z256rri:
+ case X86::VSHUFI64X2Z256rmi:
+ case X86::VSHUFI64X2Z256rri: {
+ assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr ||
+ NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) &&
+ "Unexpected new opcode!");
+ MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1);
+ int64_t ImmVal = Imm.getImm();
+ // Set bit 5, move bit 1 to bit 4, copy bit 0.
+ Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
+ break;
+ }
+ }
}
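
The immediate rewrite added for the VSHUFF/VSHUFI*Z256 cases repacks the two useful selector bits into VPERM2F128/VPERM2I128 form, as the in-code comment says: set bit 5, move bit 1 to bit 4, copy bit 0. A tiny standalone check of that remap (hypothetical, only mirroring the expression above):

    #include <cstdint>
    #include <cstdio>

    // Remap a VSHUFF/I{32X4,64X2}Z256 immediate (2 useful bits) to a
    // VPERM2F128/VPERM2I128 immediate, as in performCustomAdjustments().
    static int64_t remap(int64_t ImmVal) { return 0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1); }

    int main() {
      for (int64_t imm = 0; imm < 4; ++imm)
        printf("0x%llx -> 0x%llx\n", (long long)imm, (long long)remap(imm));
      // Expected: 0x0 -> 0x20, 0x1 -> 0x21, 0x2 -> 0x30, 0x3 -> 0x31.
      return 0;
    }
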
diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp
index b2b5a78fcdb..9664c931c35 100644
--- a/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/lib/Target/X86/X86FixupBWInsts.cpp
@@ -55,9 +55,9 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
#define FIXUPBW_DESC "X86 Byte/Word Instruction Fixup"
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 9f649dad8bc..bbc2bffdb70 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -22,9 +22,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
namespace llvm {
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 5582526541b..7bcbe199124 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -37,11 +37,11 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 988f2967401..86e65b83ffa 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -1562,6 +1562,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
bool HasFP = hasFP(MF);
uint64_t NumBytes = 0;
+ bool NeedsDwarfCFI =
+ (!MF.getTarget().getTargetTriple().isOSDarwin() &&
+ !MF.getTarget().getTargetTriple().isOSWindows()) &&
+ (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry());
+
if (IsFunclet) {
assert(HasFP && "EH funclets without FP not yet implemented");
NumBytes = getWinEHFuncletFrameSize(MF);
@@ -1584,6 +1589,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
MachineFramePtr)
.setMIFlag(MachineInstr::FrameDestroy);
+ if (NeedsDwarfCFI) {
+ unsigned DwarfStackPtr =
+ TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true);
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfa(
+ nullptr, DwarfStackPtr, -SlotSize));
+ --MBBI;
+ }
}
MachineBasicBlock::iterator FirstCSPop = MBBI;
@@ -1647,6 +1659,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
} else if (NumBytes) {
// Adjust stack pointer back: ESP += numbytes.
emitSPUpdate(MBB, MBBI, NumBytes, /*InEpilogue=*/true);
+ if (!hasFP(MF) && NeedsDwarfCFI) {
+ // Define the current CFA rule to use the provided offset.
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
+ nullptr, -CSSize - SlotSize));
+ }
--MBBI;
}
@@ -1659,6 +1676,23 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (NeedsWin64CFI && MF.hasWinCFI())
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
+ if (!hasFP(MF) && NeedsDwarfCFI) {
+ MBBI = FirstCSPop;
+ int64_t Offset = -CSSize - SlotSize;
+ // Mark callee-saved pop instruction.
+ // Define the current CFA rule to use the provided offset.
+ while (MBBI != MBB.end()) {
+ MachineBasicBlock::iterator PI = MBBI;
+ unsigned Opc = PI->getOpcode();
+ ++MBBI;
+ if (Opc == X86::POP32r || Opc == X86::POP64r) {
+ Offset += SlotSize;
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createDefCfaOffset(nullptr, Offset));
+ }
+ }
+ }
+
if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
// Add the return addr area delta back since we are not tail calling.
int Offset = -1 * X86FI->getTCReturnAddrDelta();
@@ -2577,6 +2611,7 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
unsigned Regs[2];
unsigned FoundRegs = 0;
+ auto &MRI = MBB.getParent()->getRegInfo();
auto RegMask = Prev->getOperand(1);
auto &RegClass =
@@ -2590,6 +2625,10 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
if (!RegMask.clobbersPhysReg(Candidate))
continue;
+ // Don't clobber reserved registers
+ if (MRI.isReserved(Candidate))
+ continue;
+
bool IsDef = false;
for (const MachineOperand &MO : Prev->implicit_operands()) {
if (MO.isReg() && MO.isDef() &&
@@ -2835,6 +2874,15 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
return MBBI;
}
+int X86FrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
+ return TRI->getSlotSize();
+}
+
+unsigned X86FrameLowering::getInitialCFARegister(const MachineFunction &MF)
+ const {
+ return TRI->getDwarfRegNum(StackPtr, true);
+}
+
namespace {
// Struct used by orderFrameObjects to help sort the stack objects.
struct X86FrameSortingObject {
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 38ac96e16d4..2bce79262d0 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -14,7 +14,7 @@
#ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
@@ -168,6 +168,10 @@ public:
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, bool RestoreSP = false) const;
+ int getInitialCFAOffset(const MachineFunction &MF) const override;
+
+ unsigned getInitialCFARegister(const MachineFunction &MF) const override;
+
private:
uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index e43fd508de3..0cbf7601790 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -449,15 +449,15 @@ namespace {
// Returns true if this masked compare can be implemented legally with this
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
- if (N->getOpcode() == X86ISD::PCMPEQM ||
- N->getOpcode() == X86ISD::PCMPGTM ||
- N->getOpcode() == X86ISD::CMPM ||
- N->getOpcode() == X86ISD::CMPMU) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == X86ISD::PCMPEQM || Opcode == X86ISD::PCMPGTM ||
+ Opcode == X86ISD::CMPM || Opcode == X86ISD::TESTM ||
+ Opcode == X86ISD::TESTNM || Opcode == X86ISD::CMPMU) {
// We can get 256-bit 8 element types here without VLX being enabled. When
// this happens we will use 512-bit operations and the mask will not be
// zero extended.
- if (N->getOperand(0).getValueType() == MVT::v8i32 ||
- N->getOperand(0).getValueType() == MVT::v8f32)
+ EVT OpVT = N->getOperand(0).getValueType();
+ if (OpVT == MVT::v8i32 || OpVT == MVT::v8f32)
return Subtarget->hasVLX();
return true;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b178ad6c13e..22b4d7997fa 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -380,8 +380,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Special handling for half-precision floating point conversions.
// If we don't have F16C support, then lower half float conversions
// into library calls.
- if (Subtarget.useSoftFloat() ||
- (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
+ if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
}
@@ -4998,6 +4997,8 @@ static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
switch (Opcode) {
default:
return false;
+ case X86ISD::TESTM:
+ case X86ISD::TESTNM:
case X86ISD::PCMPEQM:
case X86ISD::PCMPGTM:
case X86ISD::CMPM:
@@ -6746,6 +6747,9 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Unsupported vector type for broadcast.");
+ BitVector UndefElements;
+ SDValue Ld = BVOp->getSplatValue(&UndefElements);
+
// Attempt to use VBROADCASTM
// From this pattern:
// a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
@@ -6753,17 +6757,23 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
//
// Create (VBROADCASTM v2i1 X)
if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
- MVT EltType;
- unsigned NumElts;
+ MVT EltType = VT.getScalarType();
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue BOperand;
SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
- if (ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) {
- SDValue BOperand = ZeroExtended.getOperand(0);
+ if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
+ (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
+ Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
+ if (ZeroExtended)
+ BOperand = ZeroExtended.getOperand(0);
+ else
+ BOperand = Ld.getOperand(0).getOperand(0);
if (BOperand.getValueType().isVector() &&
BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
- if ((EltType == MVT::i64 &&
- VT.getVectorElementType() == MVT::i8) || // for broadcastmb2q
- (EltType == MVT::i32 &&
- VT.getVectorElementType() == MVT::i16)) { // for broadcastmw2d
+ if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
+ NumElts == 8)) || // for broadcastmb2q
+ (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
+ NumElts == 16))) { // for broadcastmw2d
SDValue Brdcst =
DAG.getNode(X86ISD::VBROADCASTM, dl,
MVT::getVectorVT(EltType, NumElts), BOperand);
@@ -6773,9 +6783,6 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
}
}
- BitVector UndefElements;
- SDValue Ld = BVOp->getSplatValue(&UndefElements);
-
// We need a splat of a single value to use broadcast, and it doesn't
// make any sense if the value is only in one element of the vector.
if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
@@ -7707,6 +7714,111 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
+// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
+// reasoned to be a permutation of a vector by indices in a non-constant vector.
+// (build_vector (extract_elt V, (extract_elt I, 0)),
+// (extract_elt V, (extract_elt I, 1)),
+// ...
+// ->
+// (vpermv I, V)
+//
+// TODO: Handle undefs
+// TODO: Utilize pshufb and zero mask blending to support more efficient
+// construction of vectors with constant-0 elements.
+// TODO: Use smaller-element vectors of same width, and "interpolate" the indices,
+// when no native operation is available.
+static SDValue
+LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Look for VPERMV and PSHUFB opportunities.
+ MVT VT = V.getSimpleValueType();
+ switch (VT.SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::v16i8:
+ if (!Subtarget.hasSSE3())
+ return SDValue();
+ break;
+ case MVT::v8f32:
+ case MVT::v8i32:
+ if (!Subtarget.hasAVX2())
+ return SDValue();
+ break;
+ case MVT::v4i64:
+ case MVT::v4f64:
+ if (!Subtarget.hasVLX())
+ return SDValue();
+ break;
+ case MVT::v16f32:
+ case MVT::v8f64:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ if (!Subtarget.hasAVX512())
+ return SDValue();
+ break;
+ case MVT::v32i16:
+ if (!Subtarget.hasBWI())
+ return SDValue();
+ break;
+ case MVT::v8i16:
+ case MVT::v16i16:
+ if (!Subtarget.hasVLX() || !Subtarget.hasBWI())
+ return SDValue();
+ break;
+ case MVT::v64i8:
+ if (!Subtarget.hasVBMI())
+ return SDValue();
+ break;
+ case MVT::v32i8:
+ if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())
+ return SDValue();
+ break;
+ }
+ SDValue SrcVec, IndicesVec;
+ // Check for a match of the permute source vector and permute index elements.
+ // This is done by checking that the i-th build_vector operand is of the form:
+ // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
+ for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
+ SDValue Op = V.getOperand(Idx);
+ if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // If this is the first extract encountered in V, set the source vector,
+ // otherwise verify the extract is from the previously defined source
+ // vector.
+ if (!SrcVec)
+ SrcVec = Op.getOperand(0);
+ else if (SrcVec != Op.getOperand(0))
+ return SDValue();
+ SDValue ExtractedIndex = Op->getOperand(1);
+ // Peek through extends.
+ if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
+ ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
+ ExtractedIndex = ExtractedIndex.getOperand(0);
+ if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // If this is the first extract from the index vector candidate, set the
+ // indices vector, otherwise verify the extract is from the previously
+ // defined indices vector.
+ if (!IndicesVec)
+ IndicesVec = ExtractedIndex.getOperand(0);
+ else if (IndicesVec != ExtractedIndex.getOperand(0))
+ return SDValue();
+
+ auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
+ if (!PermIdx || PermIdx->getZExtValue() != Idx)
+ return SDValue();
+ }
+ MVT IndicesVT = VT;
+ if (VT.isFloatingPoint())
+ IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
+ VT.getVectorNumElements());
+ IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
+ return DAG.getNode(VT == MVT::v16i8 ? X86ISD::PSHUFB : X86ISD::VPERMV,
+ SDLoc(V), VT, IndicesVec, SrcVec);
+}
+
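The lowering added above replaces a chain of per-element extracts with one variable permute: per lane, the result is simply SrcVec[IndicesVec[i]]. A minimal standalone C++ model of that semantics (illustrative only; the real VPERMV/PSHUFB instructions additionally interpret out-of-range or sign-bit indices in hardware-specific ways):

#include <array>
#include <cstdint>
#include <cstdio>

// Scalar model of the variable permute this lowering forms: every lane i of
// the result takes SrcVec[IndicesVec[i]], exactly the value the original
// build_vector of extract/extract chains produced.
template <typename T, size_t N>
std::array<T, N> variablePermute(const std::array<T, N> &Src,
                                 const std::array<uint32_t, N> &Indices) {
  std::array<T, N> Result{};
  for (size_t I = 0; I != N; ++I)
    Result[I] = Src[Indices[I] % N]; // model: index modulo the lane count
  return Result;
}

int main() {
  std::array<float, 8> Src = {0, 10, 20, 30, 40, 50, 60, 70};
  std::array<uint32_t, 8> Idx = {7, 6, 5, 4, 3, 2, 1, 0};
  auto R = variablePermute(Src, Idx);
  for (float F : R)
    std::printf("%.0f ", F); // prints "70 60 50 40 30 20 10 0"
  std::printf("\n");
  return 0;
}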
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -7922,6 +8034,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (IsAllConstants)
return SDValue();
+ if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
+ return V;
+
// See if we can use a vector load to get all of the elements.
if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
@@ -10716,10 +10831,16 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
- if (Subtarget.hasSSSE3())
+ if (Subtarget.hasSSSE3()) {
+ if (Subtarget.hasVLX())
+ if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ }
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
@@ -11016,10 +11137,16 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
- if (Subtarget.hasSSSE3())
+ if (Subtarget.hasSSSE3()) {
+ if (Subtarget.hasVLX())
+ if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ }
// Assume that a single SHUFPS is faster than an alternative sequence of
// multiple instructions (even if the CPU has a domain penalty).
@@ -12372,6 +12499,16 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
}
}
+
+ // Try to use SHUF128 if possible.
+ if (Subtarget.hasVLX()) {
+ if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
+ unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
+ ((WidenedMask[1] % 2) << 1);
+ return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+ DAG.getConstant(PermMask, DL, MVT::i8));
+ }
+ }
}
// Otherwise form a 128-bit permutation. After accounting for undefs,
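For the SHUF128 case added above, the immediate packs one selector bit per 128-bit lane: bit 0 picks which half of V1 supplies the low lane, bit 1 which half of V2 supplies the high lane. A tiny standalone sketch of that immediate computation (illustrative values):

#include <cstdint>
#include <cstdio>

// Model of the immediate built above for the 256-bit SHUF128 case.
uint8_t shuf128Imm(int WidenedMask0, int WidenedMask1) {
  return static_cast<uint8_t>(((WidenedMask0 % 2) << 0) |
                              ((WidenedMask1 % 2) << 1));
}

int main() {
  // Widened mask <0, 3>: low lane from V1's lane 0, high lane from V2's lane 1.
  std::printf("%d\n", shuf128Imm(0, 3)); // prints "2"
  return 0;
}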
@@ -13697,10 +13834,6 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
- if (SDValue Shuf128 =
- lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
- return Shuf128;
-
if (V2.isUndef()) {
// When the shuffle is mirrored between the 128-bit lanes of the unit, we
// can use lower latency instructions that will operate on all four
@@ -13722,6 +13855,10 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
}
+ if (SDValue Shuf128 =
+ lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ return Shuf128;
+
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -17333,6 +17470,20 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
if (Swap)
std::swap(Op0, Op1);
+
+ // If this is CMP(EQ|NE, AND(A,B), ZERO), change it to TESTM or TESTNM.
+ if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) {
+ SDValue A = peekThroughBitcasts(Op0);
+ if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) &&
+ ISD::isBuildVectorAllZeros(Op1.getNode())) {
+ MVT VT0 = Op0.getSimpleValueType();
+ SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0));
+ SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1));
+ return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM,
+ dl, VT, RHS, LHS);
+ }
+ }
+
if (Opc)
return DAG.getNode(Opc, dl, VT, Op0, Op1);
Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
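The TESTM/TESTNM rewrite above relies on the per-lane identity that comparing (A & B) against zero is exactly what the mask-producing test instructions compute. A standalone C++ sketch of the TESTNM side for eight 64-bit lanes (arbitrary inputs, not LLVM code):

#include <array>
#include <cstdint>
#include <cstdio>

// Scalar model of the rewrite: per lane,
//   PCMPEQM(AND(A,B), 0) -> TESTNM(A, B)  (mask bit set when A & B == 0)
//   PCMPNEM(AND(A,B), 0) -> TESTM(A, B)   (mask bit set when A & B != 0)
uint8_t testnm(const std::array<uint64_t, 8> &A,
               const std::array<uint64_t, 8> &B) {
  uint8_t K = 0;
  for (size_t I = 0; I != 8; ++I)
    K |= ((A[I] & B[I]) == 0) << I; // one mask bit per lane
  return K;
}

int main() {
  std::array<uint64_t, 8> A = {1, 2, 4, 8, 0, 3, 5, 9};
  std::array<uint64_t, 8> B = {2, 2, 3, 7, 1, 4, 2, 6};
  std::printf("k = 0x%02x\n", static_cast<unsigned>(testnm(A, B))); // k = 0xfd
  return 0;
}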
@@ -19838,10 +19989,19 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
else
PassThru = Src1;
- SDValue Rnd = Op.getOperand(5);
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (!isRoundModeCurDirection(Rnd))
+ return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
+ Op.getValueType(), Src1, Src2,
+ Src3, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
Op.getValueType(), Src1, Src2,
- Src3, Rnd),
+ Src3),
Mask, PassThru, Subtarget, DAG);
}
case IFMA_OP_MASKZ:
@@ -24786,9 +24946,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
- case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
case X86ISD::FRCP: return "X86ISD::FRCP";
- case X86ISD::FRCPS: return "X86ISD::FRCPS";
case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
@@ -24942,10 +25100,18 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
+ case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
+ case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
+ case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
+ case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
+ case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
+ case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
+ case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
+ case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
@@ -24966,9 +25132,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SELECT: return "X86ISD::SELECT";
case X86ISD::SELECTS: return "X86ISD::SELECTS";
case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
+ case X86ISD::RCP14: return "X86ISD::RCP14";
+ case X86ISD::RCP14S: return "X86ISD::RCP14S";
case X86ISD::RCP28: return "X86ISD::RCP28";
case X86ISD::RCP28S: return "X86ISD::RCP28S";
case X86ISD::EXP2: return "X86ISD::EXP2";
+ case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
+ case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
@@ -25006,6 +25176,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
+ case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
@@ -30314,10 +30485,17 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
+ if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
return NewOp;
- if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
+ // TODO - Remove this once we can handle the implicit zero-extension of
+ // X86ISD::PEXTRW/X86ISD::PEXTRB in:
+ // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
+ // combineBasicSADPattern.
+ if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
SDValue InputVector = N->getOperand(0);
@@ -30464,16 +30642,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// TODO - merge with combineExtractVectorElt once it can handle the implicit
-// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
-// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
-// combineBasicSADPattern.
-static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
-}
-
/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
@@ -30674,26 +30842,6 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
- case X86ISD::PALIGNR:
- // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
- if (!VT.is128BitVector())
- return false;
- Opcode = X86ISD::VALIGN;
- LLVM_FALLTHROUGH;
- case X86ISD::VALIGN: {
- if (EltVT != MVT::i32 && EltVT != MVT::i64)
- return false;
- uint64_t Imm = Op.getConstantOperandVal(2);
- MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
- unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
- unsigned EltSize = EltVT.getSizeInBits();
- // Make sure we can represent the same shift with the new VT.
- if ((ShiftAmt % EltSize) != 0)
- return false;
- Imm = ShiftAmt / EltSize;
- return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
- DAG.getConstant(Imm, DL, MVT::i8));
- }
case X86ISD::SHUF128: {
if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
return false;
@@ -34441,8 +34589,9 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// This function transforms vector truncation of 'extended sign-bits' values.
-/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
+/// This function transforms vector truncation of 'extended sign-bits' or
+/// 'extended zero-bits' values.
+/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -34475,10 +34624,19 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
// packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
unsigned NumSignBits = DAG.ComputeNumSignBits(In);
unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
- if (NumSignBits <= (InSVT.getSizeInBits() - NumPackedBits))
- return SDValue();
+ if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
+ return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+
+ // Use PACKUS if the input has zero-bits that extend all the way to the
+ // packed/truncated value. e.g. masks, zext_in_reg, etc.
+ KnownBits Known;
+ DAG.computeKnownBits(In, Known);
+ unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
+ NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
+ if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
+ return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
- return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+ return SDValue();
}
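The reasoning behind the PACKSS/PACKUS choice above: both instructions saturate, so they only behave as a plain truncation when the saturation provably never fires, which is what the sign-bit and known-zero-bit checks establish. A standalone per-lane C++ sketch for the i32-to-i16 case (not LLVM code):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Scalar model of one PACKSS lane: signed-saturate an i32 to i16. When the
// source has at least 17 sign bits (a sign-extended i16), the clamp never
// triggers, so packing is just the truncation the DAG asked for.
int16_t packss_lane(int32_t V) {
  return static_cast<int16_t>(std::clamp<int32_t>(V, -32768, 32767));
}

// Scalar model of one PACKUS lane: unsigned-saturate an i32 to u16. With at
// least 16 known leading zero bits the value is already in range, so PACKUS
// also degenerates to a plain truncation.
uint16_t packus_lane(int32_t V) {
  return static_cast<uint16_t>(std::clamp<int32_t>(V, 0, 65535));
}

int main() {
  std::printf("%d %d\n", packss_lane(-5), packus_lane(300));      // -5 300 (no saturation)
  std::printf("%d %d\n", packss_lane(100000), packus_lane(-1));   // 32767 0 (saturated)
  return 0;
}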
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
@@ -34507,7 +34665,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
}
- // Try to truncate extended sign bits with PACKSS.
+ // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
return V;
@@ -35341,9 +35499,11 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
// Do not convert the passthru input of scalar intrinsics.
// FIXME: We could allow negations of the lower element only.
- bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
+ bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
+ N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
bool NegB = invertIfNegative(B);
- bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
+ bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
+ N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
// Negative multiplication when NegA xor NegB
bool NegMul = (NegA != NegB);
@@ -35371,6 +35531,20 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
}
+ } else if (N->getOpcode() == X86ISD::FMADDS1) {
+ switch (NewOpcode) {
+ case ISD::FMA: NewOpcode = X86ISD::FMADDS1; break;
+ case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1; break;
+ case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
+ case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
+ }
+ } else if (N->getOpcode() == X86ISD::FMADDS3) {
+ switch (NewOpcode) {
+ case ISD::FMA: NewOpcode = X86ISD::FMADDS3; break;
+ case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3; break;
+ case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
+ case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
+ }
} else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
switch (NewOpcode) {
case ISD::FMA: NewOpcode = X86ISD::FMADDS1_RND; break;
@@ -36590,10 +36764,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
switch (N->getOpcode()) {
default: break;
case ISD::EXTRACT_VECTOR_ELT:
- return combineExtractVectorElt(N, DAG, DCI, Subtarget);
case X86ISD::PEXTRW:
case X86ISD::PEXTRB:
- return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
+ return combineExtractVectorElt(N, DAG, DCI, Subtarget);
case ISD::INSERT_SUBVECTOR:
return combineInsertSubvector(N, DAG, DCI, Subtarget);
case ISD::EXTRACT_SUBVECTOR:
@@ -36689,6 +36862,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FMADD_RND:
case X86ISD::FMADDS1_RND:
case X86ISD::FMADDS3_RND:
+ case X86ISD::FMADDS1:
+ case X86ISD::FMADDS3:
case ISD::FMA: return combineFMA(N, DAG, Subtarget);
case ISD::MGATHER:
case ISD::MSCATTER: return combineGatherScatter(N, DAG);
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 17cb976a4c7..d1438e59f9b 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -254,7 +254,9 @@ namespace llvm {
/// Note that these typically require refinement
/// in order to obtain suitable precision.
FRSQRT, FRCP,
- FRSQRTS, FRCPS,
+
+ // AVX-512 reciprocal approximations with a little more precision.
+ RSQRT14, RSQRT14S, RCP14, RCP14S,
// Thread Local Storage.
TLSADDR,
@@ -487,6 +489,12 @@ namespace llvm {
FMADDSUB_RND,
FMSUBADD_RND,
+ // Scalar intrinsic FMA.
+ FMADDS1, FMADDS3,
+ FNMADDS1, FNMADDS3,
+ FMSUBS1, FMSUBS3,
+ FNMSUBS1, FNMSUBS3,
+
// Scalar intrinsic FMA with rounding mode.
// Two versions, passthru bits on op1 or op3.
FMADDS1_RND, FMADDS3_RND,
@@ -555,7 +563,7 @@ namespace llvm {
RSQRT28, RSQRT28S, RCP28, RCP28S, EXP2,
// Conversions between float and half-float.
- CVTPS2PH, CVTPH2PS,
+ CVTPS2PH, CVTPH2PS, CVTPH2PS_RND,
// LWP insert record.
LWPINS,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index a73ee19423d..84b44ac677b 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -6017,16 +6017,17 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
}
multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
- string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
- SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> {
+ string OpcodeStr, SDNode OpNode, SDNode OpNodes1,
+ SDNode OpNodeRnds1, SDNode OpNodes3,
+ SDNode OpNodeRnds3, X86VectorVTInfo _,
+ string SUFF> {
let ExeDomain = _.ExeDomain in {
defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix, _,
// Operands for intrinsic are in 123 order to preserve passthru
// semantics.
- (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
- (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2,
- _.ScalarIntMemCPat:$src3, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodes1 _.RC:$src1, _.RC:$src2, _.RC:$src3)),
+ (_.VT (OpNodes1 _.RC:$src1, _.RC:$src2,
+ _.ScalarIntMemCPat:$src3)),
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
@@ -6035,10 +6036,9 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
(_.ScalarLdFrag addr:$src3)))), 0>;
defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix, _,
- (_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
- (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnds3 _.RC:$src2, _.ScalarIntMemCPat:$src3,
- _.RC:$src1, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodes3 _.RC:$src2, _.RC:$src3, _.RC:$src1)),
+ (_.VT (OpNodes3 _.RC:$src2, _.ScalarIntMemCPat:$src3,
+ _.RC:$src1)),
(_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
@@ -6050,8 +6050,8 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
// 213 and 231 patterns this helps tablegen's duplicate pattern detection.
defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix, _,
(null_frag),
- (_.VT (OpNodeRnds1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
- _.RC:$src2, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodes1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
+ _.RC:$src2)),
(null_frag),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
_.FRC:$src2))),
@@ -6061,26 +6061,29 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
}
multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
- string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+ string OpcodeStr, SDNode OpNode, SDNode OpNodes1,
+ SDNode OpNodeRnds1, SDNode OpNodes3,
SDNode OpNodeRnds3> {
let Predicates = [HasAVX512] in {
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
- OpNodeRnds1, OpNodeRnds3, f32x_info, "SS">,
+ OpNodes1, OpNodeRnds1, OpNodes3, OpNodeRnds3,
+ f32x_info, "SS">,
EVEX_CD8<32, CD8VT1>, VEX_LIG;
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
- OpNodeRnds1, OpNodeRnds3, f64x_info, "SD">,
+ OpNodes1, OpNodeRnds1, OpNodes3, OpNodeRnds3,
+ f64x_info, "SD">,
EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
}
}
-defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnds1,
- X86FmaddRnds3>;
-defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnds1,
- X86FmsubRnds3>;
-defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd,
- X86FnmaddRnds1, X86FnmaddRnds3>;
-defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub,
- X86FnmsubRnds1, X86FnmsubRnds3>;
+defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86Fmadds1,
+ X86FmaddRnds1, X86Fmadds3, X86FmaddRnds3>;
+defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86Fmsubs1,
+ X86FmsubRnds1, X86Fmsubs3, X86FmsubRnds3>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86Fnmadds1,
+ X86FnmaddRnds1, X86Fnmadds3, X86FnmaddRnds3>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86Fnmsubs1,
+ X86FnmsubRnds1, X86Fnmsubs3, X86FnmsubRnds3>;
//===----------------------------------------------------------------------===//
// AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA
@@ -6554,7 +6557,7 @@ defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
NotMemoryFoldable;
def : Pat<(f64 (fpextend FR32X:$src)),
- (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, FR64X), FR32X:$src)>,
+ (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(f64 (fpextend (loadf32 addr:$src))),
(VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
@@ -6569,7 +6572,7 @@ def : Pat<(f64 (extloadf32 addr:$src)),
Requires<[HasAVX512, OptForSpeed]>;
def : Pat<(f32 (fpround FR64X:$src)),
- (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, FR32X), FR64X:$src)>,
+ (VCVTSD2SSZrr (f32 (IMPLICIT_DEF)), FR64X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(v4f32 (X86Movss
@@ -7174,34 +7177,44 @@ def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))),
//===----------------------------------------------------------------------===//
multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
X86MemOperand x86memop, PatFrag ld_frag> {
- defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
- "vcvtph2ps", "$src", "$src",
- (X86cvtph2ps (_src.VT _src.RC:$src),
- (i32 FROUND_CURRENT))>, T8PD;
- defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src),
- "vcvtph2ps", "$src", "$src",
- (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))),
- (i32 FROUND_CURRENT))>, T8PD;
+ defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst),
+ (ins _src.RC:$src), "vcvtph2ps", "$src", "$src",
+ (X86cvtph2ps (_src.VT _src.RC:$src))>, T8PD;
+ defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
+ (ins x86memop:$src), "vcvtph2ps", "$src", "$src",
+ (X86cvtph2ps (_src.VT
+ (bitconvert
+ (ld_frag addr:$src))))>, T8PD;
}
multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
- defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
- "vcvtph2ps", "{sae}, $src", "$src, {sae}",
- (X86cvtph2ps (_src.VT _src.RC:$src),
- (i32 FROUND_NO_EXC))>, T8PD, EVEX_B;
+ defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest, (outs _dest.RC:$dst),
+ (ins _src.RC:$src), "vcvtph2ps",
+ "{sae}, $src", "$src, {sae}",
+ (X86cvtph2psRnd (_src.VT _src.RC:$src),
+ (i32 FROUND_NO_EXC))>, T8PD, EVEX_B;
}
-let Predicates = [HasAVX512] in {
+let Predicates = [HasAVX512] in
defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>,
avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>,
EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
- let Predicates = [HasVLX] in {
- defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
- loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
- defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
- loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
- }
+
+let Predicates = [HasVLX] in {
+ defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
+ loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
+ defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
+ loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
+
+ // Pattern match vcvtph2ps of a scalar i64 load.
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
+ (VCVTPH2PSZ128rm addr:$src)>;
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
+ (VCVTPH2PSZ128rm addr:$src)>;
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ (VCVTPH2PSZ128rm addr:$src)>;
}
multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
@@ -7212,17 +7225,16 @@ multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
(X86cvtps2ph (_src.VT _src.RC:$src1),
(i32 imm:$src2)),
NoItinerary, 0, 0>, AVX512AIi8Base;
- def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
- (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1),
- (i32 imm:$src2))),
- addr:$dst)]>;
- let hasSideEffects = 0, mayStore = 1 in
- def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
- (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- []>, EVEX_K;
+ let hasSideEffects = 0, mayStore = 1 in {
+ def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>;
+ def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ []>, EVEX_K;
+ }
}
multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
let hasSideEffects = 0 in
@@ -7242,6 +7254,19 @@ let Predicates = [HasAVX512] in {
defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem>,
EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
}
+
+ def : Pat<(store (f64 (extractelt
+ (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))),
+ (iPTR 0))), addr:$dst),
+ (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>;
+ def : Pat<(store (i64 (extractelt
+ (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))),
+ (iPTR 0))), addr:$dst),
+ (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>;
+ def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, i32:$src2)), addr:$dst),
+ (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, imm:$src2)>;
+ def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, i32:$src2)), addr:$dst),
+ (VCVTPS2PHZmr addr:$dst, VR512:$src1, imm:$src2)>;
}
// Patterns for matching conversions from float to half-float and vice versa.
@@ -7264,35 +7289,6 @@ let Predicates = [HasVLX] in {
(VCVTPS2PHZ128rr (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), FR32X)) >;
}
-// Patterns for matching float to half-float conversion when AVX512 is supported
-// but F16C isn't. In that case we have to use 512-bit vectors.
-let Predicates = [HasAVX512, NoVLX, NoF16C] in {
- def : Pat<(fp_to_f16 FR32X:$src),
- (i16 (EXTRACT_SUBREG
- (VMOVPDI2DIZrr
- (v8i16 (EXTRACT_SUBREG
- (VCVTPS2PHZrr
- (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
- sub_xmm), 4), sub_xmm))), sub_16bit))>;
-
- def : Pat<(f16_to_fp GR16:$src),
- (f32 (COPY_TO_REGCLASS
- (v4f32 (EXTRACT_SUBREG
- (VCVTPH2PSZrr
- (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)),
- (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)),
- sub_xmm)), sub_xmm)), FR32X))>;
-
- def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
- (f32 (COPY_TO_REGCLASS
- (v4f32 (EXTRACT_SUBREG
- (VCVTPH2PSZrr
- (VCVTPS2PHZrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
- sub_xmm), 4)), sub_xmm)), FR32X))>;
-}
-
// Unordered/Ordered scalar fp compare with SAE and set EFLAGS
multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
string OpcodeStr> {
@@ -7362,13 +7358,13 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
-defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>,
+defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, f32x_info>,
EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>,
+defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86rcp14s, f64x_info>,
VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>,
+defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s, f32x_info>,
EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>,
+defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s, f64x_info>,
VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
@@ -7414,8 +7410,8 @@ multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
}
}
-defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>;
-defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>;
+defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14>;
+defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14>;
/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
@@ -7582,7 +7578,8 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
}
multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- string SUFF, SDNode OpNode, SDNode OpNodeRnd> {
+ string SUFF, SDNode OpNode, SDNode OpNodeRnd,
+ Intrinsic Intr> {
let ExeDomain = _.ExeDomain in {
defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
@@ -7618,21 +7615,35 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
}
}
+let Predicates = [HasAVX512] in {
def : Pat<(_.EltVT (OpNode _.FRC:$src)),
(!cast<Instruction>(NAME#SUFF#Zr)
(_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
+ def : Pat<(Intr VR128X:$src),
+ (!cast<Instruction>(NAME#SUFF#Zr_Int) VR128X:$src,
+ VR128X:$src)>;
+}
+
+let Predicates = [HasAVX512, OptForSize] in {
def : Pat<(_.EltVT (OpNode (load addr:$src))),
(!cast<Instruction>(NAME#SUFF#Zm)
- (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512, OptForSize]>;
+ (_.EltVT (IMPLICIT_DEF)), addr:$src)>;
+
+ def : Pat<(Intr (scalar_to_vector (_.EltVT (load addr:$src2)))),
+ (!cast<Instruction>(NAME#SUFF#Zm_Int)
+ (_.VT (IMPLICIT_DEF)), addr:$src2)>;
+}
+
}
multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> {
defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt,
- X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS,
- NotMemoryFoldable;
+ X86fsqrtRnds, int_x86_sse_sqrt_ss>,
+ EVEX_CD8<32, CD8VT1>, EVEX_4V, XS, NotMemoryFoldable;
defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt,
- X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W,
+ X86fsqrtRnds, int_x86_sse2_sqrt_sd>,
+ EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W,
NotMemoryFoldable;
}
@@ -7641,19 +7652,6 @@ defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>,
defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG;
-let Predicates = [HasAVX512] in {
- def : Pat<(f32 (X86frsqrt FR32X:$src)),
- (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>;
- def : Pat<(f32 (X86frsqrt (load addr:$src))),
- (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
- Requires<[OptForSize]>;
- def : Pat<(f32 (X86frcp FR32X:$src)),
- (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>;
- def : Pat<(f32 (X86frcp (load addr:$src))),
- (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
- Requires<[OptForSize]>;
-}
-
multiclass
avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
@@ -8911,6 +8909,123 @@ defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" ,
avx512vl_i8_info, avx512vl_i8_info>,
EVEX_CD8<8, CD8VF>;
+// Fragments to help convert valignq into masked valignd, or valignq/valignd
+// into vpalignr.
+def ValignqImm32XForm : SDNodeXForm<imm, [{
+ return getI8Imm(N->getZExtValue() * 2, SDLoc(N));
+}]>;
+def ValignqImm8XForm : SDNodeXForm<imm, [{
+ return getI8Imm(N->getZExtValue() * 8, SDLoc(N));
+}]>;
+def ValigndImm8XForm : SDNodeXForm<imm, [{
+ return getI8Imm(N->getZExtValue() * 4, SDLoc(N));
+}]>;
+
+multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo From, X86VectorVTInfo To,
+ SDNodeXForm ImmXForm> {
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+ imm:$src3))),
+ To.RC:$src0)),
+ (!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask,
+ To.RC:$src1, To.RC:$src2,
+ (ImmXForm imm:$src3))>;
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+ imm:$src3))),
+ To.ImmAllZerosV)),
+ (!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask,
+ To.RC:$src1, To.RC:$src2,
+ (ImmXForm imm:$src3))>;
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert (To.LdFrag addr:$src2)),
+ imm:$src3))),
+ To.RC:$src0)),
+ (!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask,
+ To.RC:$src1, addr:$src2,
+ (ImmXForm imm:$src3))>;
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert (To.LdFrag addr:$src2)),
+ imm:$src3))),
+ To.ImmAllZerosV)),
+ (!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask,
+ To.RC:$src1, addr:$src2,
+ (ImmXForm imm:$src3))>;
+}
+
+multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo From,
+ X86VectorVTInfo To,
+ SDNodeXForm ImmXForm> :
+ avx512_vpalign_mask_lowering<OpcodeStr, OpNode, From, To, ImmXForm> {
+ def : Pat<(From.VT (OpNode From.RC:$src1,
+ (bitconvert (To.VT (X86VBroadcast
+ (To.ScalarLdFrag addr:$src2)))),
+ imm:$src3)),
+ (!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2,
+ (ImmXForm imm:$src3))>;
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert
+ (To.VT (X86VBroadcast
+ (To.ScalarLdFrag addr:$src2)))),
+ imm:$src3))),
+ To.RC:$src0)),
+ (!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask,
+ To.RC:$src1, addr:$src2,
+ (ImmXForm imm:$src3))>;
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert
+ (To.VT (X86VBroadcast
+ (To.ScalarLdFrag addr:$src2)))),
+ imm:$src3))),
+ To.ImmAllZerosV)),
+ (!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask,
+ To.RC:$src1, addr:$src2,
+ (ImmXForm imm:$src3))>;
+}
+
+let Predicates = [HasAVX512] in {
+ // For 512-bit we lower to the widest element type we can. So we only need
+ // to handle converting valignq to valignd.
+ defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ", X86VAlign, v8i64_info,
+ v16i32_info, ValignqImm32XForm>;
+}
+
+let Predicates = [HasVLX] in {
+ // For 128-bit we lower to the widest element type we can. So we only need
+ // to handle converting valignq to valignd.
+ defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ128", X86VAlign, v2i64x_info,
+ v4i32x_info, ValignqImm32XForm>;
+ // For 256-bit we lower to the widest element type we can. So we only need
+ // to handle converting valignq to valignd.
+ defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ256", X86VAlign, v4i64x_info,
+ v8i32x_info, ValignqImm32XForm>;
+}
+
+let Predicates = [HasVLX, HasBWI] in {
+ // We can turn 128- and 256-bit VALIGND/VALIGNQ into VPALIGNR.
+ defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v2i64x_info,
+ v16i8x_info, ValignqImm8XForm>;
+ defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v4i32x_info,
+ v16i8x_info, ValigndImm8XForm>;
+}
+
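The SDNodeXForms introduced above only rescale the rotation immediate when a VALIGNQ is re-expressed on narrower elements: qwords to dwords multiplies it by 2, qwords to bytes by 8, dwords to bytes by 4. A standalone sketch with one worked value (illustrative, not LLVM code):

#include <cstdint>
#include <cstdio>

// Immediate rescaling performed by ValignqImm32XForm, ValignqImm8XForm and
// ValigndImm8XForm respectively.
constexpr uint8_t valignqToValignd(uint8_t Imm) { return Imm * 2; }
constexpr uint8_t valignqToPalignr(uint8_t Imm) { return Imm * 8; }
constexpr uint8_t valigndToPalignr(uint8_t Imm) { return Imm * 4; }

int main() {
  // A 128-bit VALIGNQ by 1 qword is a VALIGND by 2 dwords or a VPALIGNR by 8 bytes.
  std::printf("%d %d %d\n", valignqToValignd(1), valignqToPalignr(1),
              valigndToPalignr(2)); // prints "2 8 8"
  return 0;
}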
defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" ,
avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>;
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 453dcd83df1..15466c2978f 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -290,8 +290,7 @@ multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
}
multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpStr, Intrinsic IntF32, Intrinsic IntF64,
- SDNode OpNode> {
+ string OpStr, SDNode OpNodeIntrin, SDNode OpNode> {
let ExeDomain = SSEPackedSingle in
defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", OpNode,
FR32, f32mem>,
@@ -309,43 +308,44 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
// This is because src1 is tied to dest, and the scalar intrinsics
// require the pass-through values to come from the first source
// operand, not the second.
- // TODO: Use AVX512 instructions when possible.
- let Predicates = [HasFMA] in {
- def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
+ let Predicates = [HasFMA, NoAVX512] in {
+ def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, VR128:$src2, VR128:$src3)),
(!cast<Instruction>(NAME#"213SSr_Int")
VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
+ def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, VR128:$src2, VR128:$src3)),
(!cast<Instruction>(NAME#"213SDr_Int")
VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(IntF32 VR128:$src1, VR128:$src2, sse_load_f32:$src3),
+ def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, VR128:$src2,
+ sse_load_f32:$src3)),
(!cast<Instruction>(NAME#"213SSm_Int")
VR128:$src1, VR128:$src2, sse_load_f32:$src3)>;
- def : Pat<(IntF64 VR128:$src1, VR128:$src2, sse_load_f64:$src3),
+ def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, VR128:$src2,
+ sse_load_f64:$src3)),
(!cast<Instruction>(NAME#"213SDm_Int")
VR128:$src1, VR128:$src2, sse_load_f64:$src3)>;
- def : Pat<(IntF32 VR128:$src1, sse_load_f32:$src3, VR128:$src2),
+ def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, sse_load_f32:$src3,
+ VR128:$src2)),
(!cast<Instruction>(NAME#"132SSm_Int")
VR128:$src1, VR128:$src2, sse_load_f32:$src3)>;
- def : Pat<(IntF64 VR128:$src1, sse_load_f64:$src3, VR128:$src2),
+ def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, sse_load_f64:$src3,
+ VR128:$src2)),
(!cast<Instruction>(NAME#"132SDm_Int")
VR128:$src1, VR128:$src2, sse_load_f64:$src3)>;
}
}
-defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
- int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
-defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
- int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86Fmadds1, X86Fmadd>, VEX_LIG;
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsubs1, X86Fmsub>, VEX_LIG;
-defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
- int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG;
-defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
- int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG;
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadds1, X86Fnmadd>,
+ VEX_LIG;
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsubs1, X86Fnmsub>,
+ VEX_LIG;
//===----------------------------------------------------------------------===//
@@ -385,26 +385,28 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
}
multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
- ComplexPattern mem_cpat, Intrinsic Int> {
+ ValueType VT, ComplexPattern mem_cpat, SDNode OpNode> {
let isCodeGenOnly = 1 in {
def rr_Int : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG;
+ (VT (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, VEX_W,
+ VEX_LIG;
def rm_Int : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
- mem_cpat:$src3))]>, VEX_W, VEX_LIG;
+ [(set VR128:$dst, (VT (OpNode VR128:$src1, VR128:$src2,
+ mem_cpat:$src3)))]>, VEX_W, VEX_LIG;
def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, memop:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG;
+ (VT (OpNode VR128:$src1, mem_cpat:$src2, VR128:$src3)))]>,
+ VEX_LIG;
let hasSideEffects = 0 in
def rr_Int_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
@@ -475,19 +477,19 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
let ExeDomain = SSEPackedSingle in {
// Scalar Instructions
defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
- fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32,
- int_x86_fma_vfmadd_ss>;
+ fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, sse_load_f32,
+ X86Fmadds1>;
defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
- fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32,
- int_x86_fma_vfmsub_ss>;
+ fma4s_int<0x6E, "vfmsubss", ssmem, v4f32, sse_load_f32,
+ X86Fmsubs1>;
defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
X86Fnmadd, loadf32>,
- fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32,
- int_x86_fma_vfnmadd_ss>;
+ fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32, sse_load_f32,
+ X86Fnmadds1>;
defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
X86Fnmsub, loadf32>,
- fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32,
- int_x86_fma_vfnmsub_ss>;
+ fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, sse_load_f32,
+ X86Fnmsubs1>;
// Packed Instructions
defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
loadv4f32, loadv8f32>;
@@ -506,19 +508,19 @@ let ExeDomain = SSEPackedSingle in {
let ExeDomain = SSEPackedDouble in {
// Scalar Instructions
defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
- fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64,
- int_x86_fma_vfmadd_sd>;
+ fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, sse_load_f64,
+ X86Fmadds1>;
defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
- fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64,
- int_x86_fma_vfmsub_sd>;
+ fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64, sse_load_f64,
+ X86Fmsubs1>;
defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
X86Fnmadd, loadf64>,
- fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
- int_x86_fma_vfnmadd_sd>;
+ fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64, sse_load_f64,
+ X86Fnmadds1>;
defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
X86Fnmsub, loadf64>,
- fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
- int_x86_fma_vfnmsub_sd>;
+ fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, sse_load_f64,
+ X86Fnmsubs1>;
// Packed Instructions
defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
loadv2f64, loadv4f64>;
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index c4f34bdd37e..d30400836bb 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -56,8 +56,6 @@ def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp,
def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp>;
def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
-def X86frsqrt14s: SDNode<"X86ISD::FRSQRTS", SDTFPBinOp>;
-def X86frcp14s : SDNode<"X86ISD::FRCPS", SDTFPBinOp>;
def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
@@ -482,11 +480,22 @@ def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound, [SDNPCommutat
def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound, [SDNPCommutative]>;
// Scalar FMA intrinsics with passthru bits in operand 1.
+def X86Fmadds1 : SDNode<"X86ISD::FMADDS1", SDTFPTernaryOp>;
+def X86Fnmadds1 : SDNode<"X86ISD::FNMADDS1", SDTFPTernaryOp>;
+def X86Fmsubs1 : SDNode<"X86ISD::FMSUBS1", SDTFPTernaryOp>;
+def X86Fnmsubs1 : SDNode<"X86ISD::FNMSUBS1", SDTFPTernaryOp>;
+
+// Scalar FMA intrinsics with passthru bits in operand 1, with rounding mode.
def X86FmaddRnds1 : SDNode<"X86ISD::FMADDS1_RND", SDTFmaRound>;
def X86FnmaddRnds1 : SDNode<"X86ISD::FNMADDS1_RND", SDTFmaRound>;
def X86FmsubRnds1 : SDNode<"X86ISD::FMSUBS1_RND", SDTFmaRound>;
def X86FnmsubRnds1 : SDNode<"X86ISD::FNMSUBS1_RND", SDTFmaRound>;
+def X86Fmadds3 : SDNode<"X86ISD::FMADDS3", SDTFPTernaryOp>;
+def X86Fnmadds3 : SDNode<"X86ISD::FNMADDS3", SDTFPTernaryOp>;
+def X86Fmsubs3 : SDNode<"X86ISD::FMSUBS3", SDTFPTernaryOp>;
+def X86Fnmsubs3 : SDNode<"X86ISD::FNMSUBS3", SDTFPTernaryOp>;
+
// Scalar FMA intrinsics with passthru bits in operand 3.
def X86FmaddRnds3 : SDNode<"X86ISD::FMADDS3_RND", SDTFmaRound, [SDNPCommutative]>;
def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound, [SDNPCommutative]>;
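The S1/S3 suffixes on the new nodes above encode only where the untouched upper lanes come from: operand 1 or operand 3. A standalone C++ sketch of the two passthru conventions for a 4-lane float vector (illustrative values, not LLVM code):

#include <array>
#include <cstdio>

using V4 = std::array<float, 4>;

// FMADDS1: lane 0 gets fma(a[0], b[0], c[0]); upper lanes copied from operand 1.
V4 fmadds1(V4 A, V4 B, V4 C) {
  V4 R = A;
  R[0] = A[0] * B[0] + C[0];
  return R;
}

// FMADDS3: same lane-0 result, but upper lanes copied from operand 3.
V4 fmadds3(V4 A, V4 B, V4 C) {
  V4 R = C;
  R[0] = A[0] * B[0] + C[0];
  return R;
}

int main() {
  V4 A{1, 10, 10, 10}, B{2, 20, 20, 20}, C{3, 30, 30, 30};
  V4 R1 = fmadds1(A, B, C), R3 = fmadds3(A, B, C);
  std::printf("%g %g | %g %g\n", R1[0], R1[1], R3[0], R3[1]); // 5 10 | 5 30
  return 0;
}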
@@ -498,10 +507,14 @@ def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>;
def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma, [SDNPCommutative]>;
+def X86rsqrt14 : SDNode<"X86ISD::RSQRT14", SDTFPUnaryOp>;
+def X86rcp14 : SDNode<"X86ISD::RCP14", SDTFPUnaryOp>;
def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>;
def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>;
def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOpRound>;
+def X86rsqrt14s : SDNode<"X86ISD::RSQRT14S", SDTFPBinOp>;
+def X86rcp14s : SDNode<"X86ISD::RCP14S", SDTFPBinOp>;
def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOpRound>;
def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOpRound>;
def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImmRound>;
@@ -578,7 +591,12 @@ def X86VUintToFP : SDNode<"X86ISD::CVTUI2P", SDTVintToFP>;
def X86cvtp2Int : SDNode<"X86ISD::CVTP2SI", SDTFloatToInt>;
def X86cvtp2UInt : SDNode<"X86ISD::CVTP2UI", SDTFloatToInt>;
+
def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, i16>]> >;
+
+def X86cvtph2psRnd : SDNode<"X86ISD::CVTPH2PS_RND",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
SDTCVecEltisVT<1, i16>,
SDTCisVT<2, i32>]> >;
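With the split above, X86cvtph2ps models the plain binary16-to-binary32 conversion while X86cvtph2psRnd carries the extra rounding-mode operand used by the masked 512-bit intrinsic. A standalone reference conversion for one lane (widening is exact, so no rounding decision is involved; this is a sketch, not LLVM code):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Convert one IEEE binary16 value to binary32, i.e. what a single lane of the
// cvtph2ps nodes produces.
float halfToFloat(uint16_t H) {
  uint32_t Sign = static_cast<uint32_t>(H & 0x8000u) << 16;
  uint32_t Exp = (H >> 10) & 0x1F;
  uint32_t Mant = H & 0x3FF;
  uint32_t Bits;
  if (Exp == 0x1F) {                       // Inf / NaN
    Bits = Sign | 0x7F800000u | (Mant << 13);
  } else if (Exp != 0) {                   // normal
    Bits = Sign | ((Exp + 112) << 23) | (Mant << 13);
  } else if (Mant != 0) {                  // subnormal: renormalize
    Exp = 113;
    while ((Mant & 0x400) == 0) { Mant <<= 1; --Exp; }
    Bits = Sign | (Exp << 23) | ((Mant & 0x3FF) << 13);
  } else {                                 // +/- zero
    Bits = Sign;
  }
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

int main() {
  std::printf("%g %g %g\n", halfToFloat(0x3C00), halfToFloat(0xC000),
              halfToFloat(0x0001)); // 1 -2 5.96046e-08
  return 0;
}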
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index e665ec1f14d..02a09c340ce 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -18,7 +18,7 @@
#include "X86InstrFMA3Info.h"
#include "X86RegisterInfo.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "X86GenInstrInfo.inc"
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 559a8fcf107..f00caa130d0 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -850,7 +850,6 @@ def HasLWP : Predicate<"Subtarget->hasLWP()">;
def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">;
def HasF16C : Predicate<"Subtarget->hasF16C()">;
-def NoF16C : Predicate<"!Subtarget->hasF16C()">;
def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">;
def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
def HasBMI : Predicate<"Subtarget->hasBMI()">;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 451303054f5..955a40ee171 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1514,13 +1514,12 @@ let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
(ins FR32:$src1, f64mem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_CVT_Scalar_RM>,
- XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
+ [], IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, VEX_LIG,
Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable;
}
def : Pat<(f32 (fpround FR64:$src)),
- (VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>,
+ (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
Requires<[UseAVX]>;
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
@@ -1574,20 +1573,18 @@ let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR64:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_CVT_Scalar_RR>,
- XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
+ [], IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_LIG,
Sched<[WriteCvtF2F]>, VEX_WIG, NotMemoryFoldable;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
(ins FR64:$src1, f32mem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_CVT_Scalar_RM>,
- XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
+ [], IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_LIG,
Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable;
}
def : Pat<(f64 (fpextend FR32:$src)),
- (VCVTSS2SDrr (COPY_TO_REGCLASS FR32:$src, FR64), FR32:$src)>, Requires<[UseAVX]>;
+ (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fpextend (loadf32 addr:$src)),
(VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
@@ -1899,7 +1896,7 @@ let Predicates = [HasAVX, NoVLX] in {
(v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
(VCVTTPD2DQrm addr:$src)>;
}
-} // Predicates = [HasAVX]
+} // Predicates = [HasAVX, NoVLX]
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
@@ -3095,7 +3092,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType vt, ValueType ScalarVT,
X86MemOperand x86memop,
Intrinsic Intr, SDNode OpNode, Domain d,
- OpndItins itins, string Suffix> {
+ OpndItins itins, Predicate target, string Suffix> {
let hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -3126,21 +3123,17 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
// vrcpss mem, %xmm0, %xmm0
// TODO: In theory, we could fold the load, and avoid the stall caused by
// the partial register store, either in ExecutionDepsFix or with smarter RA.
- let Predicates = [UseAVX] in {
+ let Predicates = [target] in {
def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r)
(ScalarVT (IMPLICIT_DEF)), RC:$src)>;
- }
- let Predicates = [HasAVX] in {
def : Pat<(Intr VR128:$src),
(!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src,
VR128:$src)>;
}
- let Predicates = [HasAVX, OptForSize] in {
+ let Predicates = [target, OptForSize] in {
def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
(!cast<Instruction>("V"#NAME#Suffix##m_Int)
(vt (IMPLICIT_DEF)), addr:$src2)>;
- }
- let Predicates = [UseAVX, OptForSize] in {
def : Pat<(ScalarVT (OpNode (load addr:$src))),
(!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
addr:$src)>;
@@ -3186,7 +3179,7 @@ let Predicates = prds in {
/// sse2_fp_unop_p - SSE2 unops in vector forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
SDNode OpNode, OpndItins itins> {
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
@@ -3220,41 +3213,41 @@ let Predicates = [HasAVX] in {
}
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
+ OpndItins itins, Predicate AVXTarget> {
defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
SSEPackedSingle, itins, UseSSE1, "SS">, XS;
defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
f32mem,
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
- SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG, VEX_WIG,
- NotMemoryFoldable;
+ SSEPackedSingle, itins, AVXTarget, "SS">, XS, VEX_4V,
+ VEX_LIG, VEX_WIG, NotMemoryFoldable;
}
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
+ OpndItins itins, Predicate AVXTarget> {
defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
f64mem,
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
- OpNode, SSEPackedDouble, itins, "SD">,
+ OpNode, SSEPackedDouble, itins, AVXTarget, "SD">,
XD, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
}
// Square root.
-defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>,
- sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>,
- sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>,
+defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS, UseAVX>,
+ sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX, NoVLX]>,
+ sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD, UseAVX>,
sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
- sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >;
-defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
- sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>;
+defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS, HasAVX>,
+ sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX]>;
+defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS, HasAVX>,
+ sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX]>;
// There is no f64 version of the reciprocal approximation instructions.
@@ -7692,22 +7685,24 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
-multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop> {
def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
- [(set RC:$dst, (Int VR128:$src))]>,
+ [(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
T8PD, VEX, Sched<[WriteCvtF2F]>;
let hasSideEffects = 0, mayLoad = 1 in
def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
- Sched<[WriteCvtF2FLd]>;
+ "vcvtph2ps\t{$src, $dst|$dst, $src}",
+ [(set RC:$dst, (X86cvtph2ps (bc_v8i16
+ (loadv2i64 addr:$src))))]>,
+ T8PD, VEX, Sched<[WriteCvtF2FLd]>;
}
-multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop> {
def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
(ins RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
+ [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>,
TAPD, VEX, Sched<[WriteCvtF2F]>;
let hasSideEffects = 0, mayStore = 1,
SchedRW = [WriteCvtF2FLd, WriteRMW] in
@@ -7717,32 +7712,31 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
TAPD, VEX;
}
-let Predicates = [HasF16C] in {
- defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
- defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
- defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
- defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
+let Predicates = [HasF16C, NoVLX] in {
+ defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem>;
+ defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem>, VEX_L;
+ defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem>;
+ defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem>, VEX_L;
// Pattern match vcvtph2ps of a scalar i64 load.
- def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
(VCVTPH2PSrm addr:$src)>;
- def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
(VCVTPH2PSrm addr:$src)>;
- def : Pat<(int_x86_vcvtph2ps_128 (bitconvert
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
(VCVTPH2PSrm addr:$src)>;
- def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16
- (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
- addr:$dst),
- (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
- def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16
- (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
- addr:$dst),
- (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
- def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)),
- addr:$dst),
- (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
+ def : Pat<(store (f64 (extractelt
+ (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
+ (iPTR 0))), addr:$dst),
+ (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+ def : Pat<(store (i64 (extractelt
+ (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
+ (iPTR 0))), addr:$dst),
+ (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+ def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst),
+ (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
}
// Patterns for matching conversions from float to half-float and vice versa.
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 6f5e41bcdc6..9edac22d5ba 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -422,12 +422,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
@@ -1077,12 +1071,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FSUBS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FSUBS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTPH2PS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM,
- X86ISD::CVTPH2PS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTPH2PS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_RND),
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK,
X86ISD::CVTPS2PH, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK,
@@ -1098,8 +1092,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, ISD::FMA,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
@@ -1220,8 +1214,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, ISD::FMA,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
@@ -1239,8 +1233,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
X86ISD::FMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
@@ -1259,8 +1253,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
X86ISD::FNMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
@@ -1298,8 +1292,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, ISD::FMA,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
@@ -1428,26 +1422,26 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_ptestnm_w_128, CMP_MASK, X86ISD::TESTNM, 0),
X86_INTRINSIC_DATA(avx512_ptestnm_w_256, CMP_MASK, X86ISD::TESTNM, 0),
X86_INTRINSIC_DATA(avx512_ptestnm_w_512, CMP_MASK, X86ISD::TESTNM, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
@@ -1468,6 +1462,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, ISD::FMA, 0),
X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, ISD::FMA, 0),
X86_INTRINSIC_DATA(fma_vfmadd_ps_256, INTR_TYPE_3OP, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADDS1, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADDS1, 0),
X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
@@ -1476,6 +1472,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(fma_vfmsub_pd_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
X86_INTRINSIC_DATA(fma_vfmsub_ps, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
X86_INTRINSIC_DATA(fma_vfmsub_ps_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmsub_sd, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0),
+ X86_INTRINSIC_DATA(fma_vfmsub_ss, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0),
X86_INTRINSIC_DATA(fma_vfmsubadd_pd, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(fma_vfmsubadd_ps, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
@@ -1484,10 +1482,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(fma_vfnmadd_pd_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
X86_INTRINSIC_DATA(fma_vfnmadd_ps, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
X86_INTRINSIC_DATA(fma_vfnmadd_ps_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfnmadd_sd, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0),
+ X86_INTRINSIC_DATA(fma_vfnmadd_ss, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_pd, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfnmsub_sd, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
+ X86_INTRINSIC_DATA(fma_vfnmsub_ss, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
@@ -1584,6 +1586,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
+ X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
+ X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
+ X86_INTRINSIC_DATA(vcvtps2ph_256, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 98b4863134e..b1438bf7bc0 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -22,6 +22,38 @@
using namespace llvm;
using namespace TargetOpcode;
+/// FIXME: The following static functions are SizeChangeStrategy functions
+/// that are meant to temporarily mimic the behaviour of the old legalization
+/// based on doubling/halving non-legal types as closely as possible. This is
+/// not entirely possible as only legalizing the types that are exactly a power
+/// of 2 times the size of the legal types would require specifying all those
+/// sizes explicitly.
+/// In practice, not specifying those isn't a problem, and the below functions
+/// should disappear quickly as we add support for legalizing non-power-of-2
+/// sized types further.
+static void
+addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
+ const LegalizerInfo::SizeAndActionsVec &v) {
+ for (unsigned i = 0; i < v.size(); ++i) {
+ result.push_back(v[i]);
+ if (i + 1 < v[i].first && i + 1 < v.size() &&
+ v[i + 1].first != v[i].first + 1)
+ result.push_back({v[i].first + 1, LegalizerInfo::Unsupported});
+ }
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1(const LegalizerInfo::SizeAndActionsVec &v) {
+ assert(v.size() >= 1);
+ assert(v[0].first > 1);
+ LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::WidenScalar},
+ {2, LegalizerInfo::Unsupported}};
+ addAndInterleaveWithUnsupported(result, v);
+ auto Largest = result.back().first;
+ result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+ return result;
+}
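// Editorial sketch (not part of this patch): for a hypothetical legal-size
// vector such as v = {{8, Legal}, {16, Legal}, {32, Legal}}, widen_1 as
// defined above would return
//   {{1, WidenScalar}, {2, Unsupported},
//    {8, Legal},  {9, Unsupported},
//    {16, Legal}, {17, Unsupported},
//    {32, Legal}, {33, Unsupported}},
// i.e. 1-bit scalars are widened and every size not explicitly listed is
// marked Unsupported.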
+
X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
const X86TargetMachine &TM)
: Subtarget(STI), TM(TM) {
@@ -37,6 +69,17 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
setLegalizerInfoAVX512DQ();
setLegalizerInfoAVX512BW();
+ setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1);
+ for (unsigned BinOp : {G_SUB, G_MUL, G_AND, G_OR, G_XOR})
+ setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1);
+ for (unsigned MemOp : {G_LOAD, G_STORE})
+ setLegalizeScalarToDifferentSizeStrategy(MemOp, 0,
+ narrowToSmallerAndWidenToSmallest);
+ setLegalizeScalarToDifferentSizeStrategy(
+ G_GEP, 1, widenToLargerTypesUnsupportedOtherwise);
+ setLegalizeScalarToDifferentSizeStrategy(
+ G_CONSTANT, 0, widenToLargerTypesAndNarrowToLargest);
+
computeTables();
}
@@ -47,7 +90,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
const LLT s8 = LLT::scalar(8);
const LLT s16 = LLT::scalar(16);
const LLT s32 = LLT::scalar(32);
- const LLT s64 = LLT::scalar(64);
for (auto Ty : {p0, s1, s8, s16, s32})
setAction({G_IMPLICIT_DEF, Ty}, Legal);
@@ -55,15 +97,10 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
for (auto Ty : {s8, s16, s32, p0})
setAction({G_PHI, Ty}, Legal);
- setAction({G_PHI, s1}, WidenScalar);
-
- for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) {
+ for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
for (auto Ty : {s8, s16, s32})
setAction({BinOp, Ty}, Legal);
- setAction({BinOp, s1}, WidenScalar);
- }
-
for (unsigned Op : {G_UADDE}) {
setAction({Op, s32}, Legal);
setAction({Op, 1, s1}, Legal);
@@ -73,7 +110,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
for (auto Ty : {s8, s16, s32, p0})
setAction({MemOp, Ty}, Legal);
- setAction({MemOp, s1}, WidenScalar);
// And everything's fine in addrspace 0.
setAction({MemOp, 1, p0}, Legal);
}
@@ -85,9 +121,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
setAction({G_GEP, p0}, Legal);
setAction({G_GEP, 1, s32}, Legal);
- for (auto Ty : {s1, s8, s16})
- setAction({G_GEP, 1, Ty}, WidenScalar);
-
// Control-flow
setAction({G_BRCOND, s1}, Legal);
@@ -95,9 +128,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
for (auto Ty : {s8, s16, s32, p0})
setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
- setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar);
- setAction({TargetOpcode::G_CONSTANT, s64}, NarrowScalar);
-
// Extensions
for (auto Ty : {s8, s16, s32}) {
setAction({G_ZEXT, Ty}, Legal);
@@ -105,12 +135,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
setAction({G_ANYEXT, Ty}, Legal);
}
- for (auto Ty : {s1, s8, s16}) {
- setAction({G_ZEXT, 1, Ty}, Legal);
- setAction({G_SEXT, 1, Ty}, Legal);
- setAction({G_ANYEXT, 1, Ty}, Legal);
- }
-
// Comparison
setAction({G_ICMP, s1}, Legal);
@@ -123,7 +147,6 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
if (!Subtarget.is64Bit())
return;
- const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
setAction({G_IMPLICIT_DEF, s64}, Legal);
@@ -145,7 +168,6 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
// Extensions
for (unsigned extOp : {G_ZEXT, G_SEXT, G_ANYEXT}) {
setAction({extOp, s64}, Legal);
- setAction({extOp, 1, s32}, Legal);
}
// Comparison
diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp
index 0dd13077c37..67d95c2233d 100644
--- a/lib/Target/X86/X86MacroFusion.cpp
+++ b/lib/Target/X86/X86MacroFusion.cpp
@@ -14,8 +14,8 @@
#include "X86MacroFusion.h"
#include "X86Subtarget.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index 3069d1fd349..9b7732c1db8 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -23,10 +23,10 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 1f49650340e..efa0cd2c6bc 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -27,14 +27,14 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index b0ce1335bd3..66fea1e688f 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -161,6 +161,8 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
// In Regcall calling convention those registers are used for passing
// parameters. Thus we need to prevent lazy binding in Regcall.
return X86II::MO_GOTPCREL;
+ if (F && F->hasFnAttribute(Attribute::NonLazyBind) && is64Bit())
+ return X86II::MO_GOTPCREL;
return X86II::MO_PLT;
}
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index a8d7f290688..a21d068c7f4 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -463,7 +463,7 @@ public:
bool hasPCLMUL() const { return HasPCLMUL; }
  // Prefer FMA4 to FMA - it's better for commutation/memory folding and
// has equal or better performance on all supported targets.
- bool hasFMA() const { return (HasFMA || hasAVX512()) && !HasFMA4; }
+ bool hasFMA() const { return HasFMA && !HasFMA4; }
bool hasFMA4() const { return HasFMA4; }
bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
bool hasXOP() const { return HasXOP; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 6e6c724eb0a..11fe84f162d 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -436,4 +436,11 @@ void X86PassConfig::addPreEmitPass() {
addPass(createX86FixupLEAs());
addPass(createX86EvexToVexInsts());
}
+
+  // Verify basic block incoming and outgoing CFA offset and register values and
+ // correct CFA calculation rule where needed by inserting appropriate CFI
+ // instructions.
+ const Triple &TT = TM->getTargetTriple();
+ if (!TT.isOSDarwin() && !TT.isOSWindows())
+ addPass(createCFIInstrInserter());
}
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index effbd07fa31..6772d96c799 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1437,7 +1437,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return Entry->Cost;
}
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
}
int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
@@ -2644,12 +2644,15 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
{ 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
{ 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
{ 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
+    { 3, MVT::v8f32, 17 }, //(load 24f32 and) deinterleave into 3 x 8f32
{ 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
{ 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
{ 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
{ 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
- { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8
+ { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
+
+    { 8, MVT::v8f32, 40 } //(load 64f32 and) deinterleave into 8 x 8f32
};
static const CostTblEntry AVX2InterleavedStoreTbl[] = {
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index fb8c2a71c9a..ba01f1e25ba 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -26,13 +26,13 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <cassert>
diff --git a/lib/Target/X86/X86WinAllocaExpander.cpp b/lib/Target/X86/X86WinAllocaExpander.cpp
index fc08f1582ad..8a186e94d9c 100644
--- a/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -25,9 +25,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h
index 27584f4e2b6..e98e9cda11d 100644
--- a/lib/Target/XCore/XCoreFrameLowering.h
+++ b/lib/Target/XCore/XCoreFrameLowering.h
@@ -15,7 +15,7 @@
#ifndef LLVM_LIB_TARGET_XCORE_XCOREFRAMELOWERING_H
#define LLVM_LIB_TARGET_XCORE_XCOREFRAMELOWERING_H
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h
index a377784caf4..9d9ee33ce22 100644
--- a/lib/Target/XCore/XCoreInstrInfo.h
+++ b/lib/Target/XCore/XCoreInstrInfo.h
@@ -15,7 +15,7 @@
#define LLVM_LIB_TARGET_XCORE_XCOREINSTRINFO_H
#include "XCoreRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "XCoreGenInstrInfo.inc"
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index d34e928b14f..a6cf6837009 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -30,7 +30,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 12090bff381..4bb2984e3b4 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -448,9 +448,13 @@ static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV,
for (auto *GVE : GVs) {
DIVariable *Var = GVE->getVariable();
DIExpression *Expr = GVE->getExpression();
- if (NumElements > 1)
- Expr = DIExpression::createFragmentExpression(Expr, FragmentOffsetInBits,
- FragmentSizeInBits);
+ if (NumElements > 1) {
+ if (auto E = DIExpression::createFragmentExpression(
+ Expr, FragmentOffsetInBits, FragmentSizeInBits))
+ Expr = *E;
+ else
+ return;
+ }
auto *NGVE = DIGlobalVariableExpression::get(GVE->getContext(), Var, Expr);
NGV->addDebugInfo(NGVE);
}
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index 9fa5ed9ab2b..6cef866b7b8 100644
--- a/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -1401,7 +1401,7 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
FAlias->takeName(F);
if (FAlias->hasName())
F->setName(FAlias->getName() + ".cfi");
- F->replaceAllUsesWith(FAlias);
+ F->replaceUsesExceptBlockAddr(FAlias);
}
if (!F->isDeclarationForLinker())
F->setLinkage(GlobalValue::InternalLinkage);
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index b5267f75e41..c47d8b78df3 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -931,15 +931,17 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE))
continue;
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined",
- CS.getInstruction())
- << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
- << ore::NV("Caller", CS.getCaller());
- });
+ // Construct remark before doing the inlining, as after successful inlining
+ // the callsite is removed.
+ OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
+ OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
+ << ore::NV("Caller", CS.getCaller());
InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
- InlineFunction(CS, IFI);
+ if (!InlineFunction(CS, IFI))
+ continue;
+
+ ORE.emit(OR);
// Now update the entry count:
if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 828eb5eee29..5d373665509 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -467,6 +467,9 @@ void PassManagerBuilder::populateModulePassManager(
addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
+ if (OptLevel > 2)
+ MPM.add(createCallSiteSplittingPass());
+
MPM.add(createIPSCCPPass()); // IP SCCP
MPM.add(createCalledValuePropagationPass());
MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
@@ -545,6 +548,9 @@ void PassManagerBuilder::populateModulePassManager(
// unrolling/vectorization/... now. We'll first run the inliner + CGSCC passes
// during ThinLTO and perform the rest of the optimizations afterward.
if (PrepareForThinLTO) {
+ // Ensure we perform any last passes, but do so before renaming anonymous
+ // globals in case the passes add any.
+ addExtensionsToPM(EP_OptimizerLast, MPM);
// Rename anon globals to be able to export them in the summary.
MPM.add(createNameAnonGlobalPass());
return;
@@ -703,6 +709,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
PM.add(createInferFunctionAttrsLegacyPass());
if (OptLevel > 1) {
+ // Split call-site with more constrained arguments.
+ PM.add(createCallSiteSplittingPass());
+
// Indirect call promotion. This should promote all the targets that are
// left by the earlier promotion pass that promotes intra-module targets.
// This two-step promotion is to save the compile time. For LTO, it should
diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp
index 34414f96cca..8930e9b2b95 100644
--- a/lib/Transforms/IPO/SampleProfile.cpp
+++ b/lib/Transforms/IPO/SampleProfile.cpp
@@ -1182,24 +1182,20 @@ void SampleProfileLoader::buildEdges(Function &F) {
}
}
-/// Sorts the CallTargetMap \p M by count in descending order and stores the
-/// sorted result in \p Sorted. Returns the total counts.
-static uint64_t SortCallTargets(SmallVector<InstrProfValueData, 2> &Sorted,
- const SampleRecord::CallTargetMap &M) {
- Sorted.clear();
- uint64_t Sum = 0;
- for (auto I = M.begin(); I != M.end(); ++I) {
- Sum += I->getValue();
- Sorted.push_back({Function::getGUID(I->getKey()), I->getValue()});
- }
- std::sort(Sorted.begin(), Sorted.end(),
+/// Returns the CallTargetMap \p M sorted by count in descending order.
+static SmallVector<InstrProfValueData, 2> SortCallTargets(
+ const SampleRecord::CallTargetMap &M) {
+ SmallVector<InstrProfValueData, 2> R;
+ for (auto I = M.begin(); I != M.end(); ++I)
+ R.push_back({Function::getGUID(I->getKey()), I->getValue()});
+ std::sort(R.begin(), R.end(),
[](const InstrProfValueData &L, const InstrProfValueData &R) {
if (L.Count == R.Count)
return L.Value > R.Value;
else
return L.Count > R.Count;
});
- return Sum;
+ return R;
}
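// Editorial sketch (not part of this patch): with a hypothetical map
//   M = { "foo" -> 100, "bar" -> 250 },
// the rewritten SortCallTargets returns
//   [ {Function::getGUID("bar"), 250}, {Function::getGUID("foo"), 100} ],
// i.e. entries ordered by descending count (ties broken by descending GUID);
// the total count is now obtained separately via findIndirectCallFunctionSamples
// at the call site in propagateWeights.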
/// \brief Propagate weights into edges
@@ -1292,8 +1288,10 @@ void SampleProfileLoader::propagateWeights(Function &F) {
auto T = FS->findCallTargetMapAt(LineOffset, Discriminator);
if (!T || T.get().empty())
continue;
- SmallVector<InstrProfValueData, 2> SortedCallTargets;
- uint64_t Sum = SortCallTargets(SortedCallTargets, T.get());
+ SmallVector<InstrProfValueData, 2> SortedCallTargets =
+ SortCallTargets(T.get());
+ uint64_t Sum;
+ findIndirectCallFunctionSamples(I, Sum);
annotateValueSite(*I.getParent()->getParent()->getParent(), I,
SortedCallTargets, Sum, IPVK_IndirectCallTarget,
SortedCallTargets.size());
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 18b246b5d99..d28d615f47e 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -482,7 +482,7 @@ Value *FAddCombine::performFactorization(Instruction *I) {
return nullptr;
FastMathFlags Flags;
- Flags.setUnsafeAlgebra();
+ Flags.setFast();
if (I0) Flags &= I->getFastMathFlags();
if (I1) Flags &= I->getFastMathFlags();
@@ -511,7 +511,7 @@ Value *FAddCombine::performFactorization(Instruction *I) {
}
Value *FAddCombine::simplify(Instruction *I) {
- assert(I->hasUnsafeAlgebra() && "Should be in unsafe mode");
+ assert(I->isFast() && "Expected 'fast' instruction");
// Currently we are not able to handle vector type.
if (I->getType()->isVectorTy())
@@ -1386,7 +1386,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
if (Value *V = SimplifySelectsFeedingBinaryOp(I, LHS, RHS))
return replaceInstUsesWith(I, V);
- if (I.hasUnsafeAlgebra()) {
+ if (I.isFast()) {
if (Value *V = FAddCombine(Builder).simplify(&I))
return replaceInstUsesWith(I, V);
}
@@ -1736,7 +1736,7 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
return replaceInstUsesWith(I, V);
- if (I.hasUnsafeAlgebra()) {
+ if (I.isFast()) {
if (Value *V = FAddCombine(Builder).simplify(&I))
return replaceInstUsesWith(I, V);
}
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 7a4abc9aca0..a00e6f73ab8 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2017,7 +2017,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
case Intrinsic::fmuladd: {
// Canonicalize fast fmuladd to the separate fmul + fadd.
- if (II->hasUnsafeAlgebra()) {
+ if (II->isFast()) {
BuilderTy::FastMathFlagGuard Guard(Builder);
Builder.setFastMathFlags(II->getFastMathFlags());
Value *Mul = Builder.CreateFMul(II->getArgOperand(0),
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index cb4788576c5..2974449830d 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -426,8 +426,7 @@ Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
// Look for an appropriate type:
// - The type of Idx if the magic fits
- // - The smallest fitting legal type if we have a DataLayout
- // - Default to i32
+ // - The smallest fitting legal type
if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth())
Ty = Idx->getType();
else
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index e6b97538267..87666360c1a 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -487,7 +487,7 @@ static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op);
if (!II)
return;
- if (II->getIntrinsicID() != Intrinsic::log2 || !II->hasUnsafeAlgebra())
+ if (II->getIntrinsicID() != Intrinsic::log2 || !II->isFast())
return;
Log2 = II;
@@ -498,7 +498,8 @@ static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) {
Instruction *I = dyn_cast<Instruction>(OpLog2Of);
if (!I)
return;
- if (I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra())
+
+ if (I->getOpcode() != Instruction::FMul || !I->isFast())
return;
if (match(I->getOperand(0), m_SpecificFP(0.5)))
@@ -601,7 +602,7 @@ Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, Constant *C,
}
if (R) {
- R->setHasUnsafeAlgebra(true);
+ R->setFast(true);
InsertNewInstWith(R, *InsertBefore);
}
@@ -622,7 +623,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
- bool AllowReassociate = I.hasUnsafeAlgebra();
+ bool AllowReassociate = I.isFast();
// Simplify mul instructions with a constant RHS.
if (isa<Constant>(Op1)) {
@@ -1341,7 +1342,7 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
if (Instruction *R = FoldOpIntoSelect(I, SI))
return R;
- bool AllowReassociate = I.hasUnsafeAlgebra();
+ bool AllowReassociate = I.isFast();
bool AllowReciprocal = I.hasAllowReciprocal();
if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 45541c9adc0..44bbb84686a 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -310,6 +310,40 @@ static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
}
}
+// If this is a bitwise operator or add with a constant RHS we might be able
+// to pull it through a shift.
+static bool canShiftBinOpWithConstantRHS(BinaryOperator &Shift,
+ BinaryOperator *BO,
+ const APInt &C) {
+  bool IsValid = true;     // Valid only for And, Or, Xor.
+  bool HighBitSet = false; // Transform if high bit of constant set?
+
+ switch (BO->getOpcode()) {
+ default: IsValid = false; break; // Do not perform transform!
+ case Instruction::Add:
+ IsValid = Shift.getOpcode() == Instruction::Shl;
+ break;
+ case Instruction::Or:
+ case Instruction::Xor:
+ HighBitSet = false;
+ break;
+ case Instruction::And:
+ HighBitSet = true;
+ break;
+ }
+
+ // If this is a signed shift right, and the high bit is modified
+ // by the logical operation, do not perform the transformation.
+ // The HighBitSet boolean indicates the value of the high bit of
+ // the constant which would cause it to be modified for this
+ // operation.
+ //
+ if (IsValid && Shift.getOpcode() == Instruction::AShr)
+ IsValid = C.isNegative() == HighBitSet;
+
+ return IsValid;
+}
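// Editorial sketch (not part of this patch): the fold guarded by this helper
// rewrites, for example,
//   shl (or X, 5), 1  -->  or (shl X, 1), 10
// and, for an arithmetic shift right, it rejects an 'or'/'xor' whose constant
// has the sign bit set (and an 'and' whose constant does not), since the
// logical operation would then change the sign bit that ashr replicates.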
+
Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
BinaryOperator &I) {
bool isLeftShift = I.getOpcode() == Instruction::Shl;
@@ -472,33 +506,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
// shift is the only use, we can pull it out of the shift.
const APInt *Op0C;
if (match(Op0BO->getOperand(1), m_APInt(Op0C))) {
- bool isValid = true; // Valid only for And, Or, Xor
- bool highBitSet = false; // Transform if high bit of constant set?
-
- switch (Op0BO->getOpcode()) {
- default: isValid = false; break; // Do not perform transform!
- case Instruction::Add:
- isValid = isLeftShift;
- break;
- case Instruction::Or:
- case Instruction::Xor:
- highBitSet = false;
- break;
- case Instruction::And:
- highBitSet = true;
- break;
- }
-
- // If this is a signed shift right, and the high bit is modified
- // by the logical operation, do not perform the transformation.
- // The highBitSet boolean indicates the value of the high bit of
- // the constant which would cause it to be modified for this
- // operation.
- //
- if (isValid && I.getOpcode() == Instruction::AShr)
- isValid = Op0C->isNegative() == highBitSet;
-
- if (isValid) {
+ if (canShiftBinOpWithConstantRHS(I, Op0BO, *Op0C)) {
Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
cast<Constant>(Op0BO->getOperand(1)), Op1);
@@ -525,6 +533,53 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
return BinaryOperator::CreateSub(NewRHS, NewShift);
}
}
+
+ // If we have a select that conditionally executes some binary operator,
+  // see if we can pull the select and operator through the shift.
+ //
+ // For example, turning:
+ // shl (select C, (add X, C1), X), C2
+ // Into:
+ // Y = shl X, C2
+ // select C, (add Y, C1 << C2), Y
+ Value *Cond;
+ BinaryOperator *TBO;
+ Value *FalseVal;
+ if (match(Op0, m_Select(m_Value(Cond), m_OneUse(m_BinOp(TBO)),
+ m_Value(FalseVal)))) {
+ const APInt *C;
+ if (!isa<Constant>(FalseVal) && TBO->getOperand(0) == FalseVal &&
+ match(TBO->getOperand(1), m_APInt(C)) &&
+ canShiftBinOpWithConstantRHS(I, TBO, *C)) {
+ Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
+ cast<Constant>(TBO->getOperand(1)), Op1);
+
+ Value *NewShift =
+ Builder.CreateBinOp(I.getOpcode(), FalseVal, Op1);
+ Value *NewOp = Builder.CreateBinOp(TBO->getOpcode(), NewShift,
+ NewRHS);
+ return SelectInst::Create(Cond, NewOp, NewShift);
+ }
+ }
+
+ BinaryOperator *FBO;
+ Value *TrueVal;
+ if (match(Op0, m_Select(m_Value(Cond), m_Value(TrueVal),
+ m_OneUse(m_BinOp(FBO))))) {
+ const APInt *C;
+ if (!isa<Constant>(TrueVal) && FBO->getOperand(0) == TrueVal &&
+ match(FBO->getOperand(1), m_APInt(C)) &&
+ canShiftBinOpWithConstantRHS(I, FBO, *C)) {
+ Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
+ cast<Constant>(FBO->getOperand(1)), Op1);
+
+ Value *NewShift =
+ Builder.CreateBinOp(I.getOpcode(), TrueVal, Op1);
+ Value *NewOp = Builder.CreateBinOp(FBO->getOpcode(), NewShift,
+ NewRHS);
+ return SelectInst::Create(Cond, NewShift, NewOp);
+ }
+ }
}
return nullptr;
diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 11a43e803a9..c92d48396c8 100644
--- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -844,8 +844,9 @@ public:
PGOUseFunc(Function &Func, Module *Modu,
std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
BranchProbabilityInfo *BPI = nullptr,
- BlockFrequencyInfo *BFI = nullptr)
- : F(Func), M(Modu), FuncInfo(Func, ComdatMembers, false, BPI, BFI),
+ BlockFrequencyInfo *BFIin = nullptr)
+ : F(Func), M(Modu), BFI(BFIin),
+ FuncInfo(Func, ComdatMembers, false, BPI, BFIin),
FreqAttr(FFA_Normal) {}
// Read counts for the instrumented BB from profile.
@@ -863,6 +864,9 @@ public:
// Annotate the value profile call sites for one value kind.
void annotateValueSites(uint32_t Kind);
+ // Annotate the irreducible loop header weights.
+ void annotateIrrLoopHeaderWeights();
+
// The hotness of the function from the profile count.
enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };
@@ -894,6 +898,7 @@ public:
private:
Function &F;
Module *M;
+ BlockFrequencyInfo *BFI;
// This member stores the shared information with class PGOGenFunc.
FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo;
@@ -1183,6 +1188,18 @@ void PGOUseFunc::setBranchWeights() {
}
}
+void PGOUseFunc::annotateIrrLoopHeaderWeights() {
+ DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n");
+ // Find irr loop headers
+ for (auto &BB : F) {
+ if (BFI->isIrrLoopHeader(&BB)) {
+ TerminatorInst *TI = BB.getTerminator();
+ const UseBBInfo &BBCountInfo = getBBInfo(&BB);
+ setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue);
+ }
+ }
+}
+
void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
Module *M = F.getParent();
IRBuilder<> Builder(&SI);
@@ -1441,6 +1458,7 @@ static bool annotateAllFunctions(
Func.populateCounters();
Func.setBranchWeights();
Func.annotateValueSites();
+ Func.annotateIrrLoopHeaderWeights();
PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr();
if (FreqAttr == PGOUseFunc::FFA_Cold)
ColdFunctions.push_back(&F);
@@ -1582,6 +1600,12 @@ void llvm::setProfMetadata(Module *M, Instruction *TI,
namespace llvm {
+void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count) {
+ MDBuilder MDB(M->getContext());
+ TI->setMetadata(llvm::LLVMContext::MD_irr_loop,
+ MDB.createIrrLoopHeaderWeight(Count));
+}
+
template <> struct GraphTraits<PGOUseFunc *> {
using NodeRef = const BasicBlock *;
using ChildIteratorType = succ_const_iterator;
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index f04d0f05ffc..1e683db5020 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -118,7 +119,8 @@ class AggressiveDeadCodeElimination {
PostDominatorTree &PDT;
/// Mapping of blocks to associated information, an element in BlockInfoVec.
- DenseMap<BasicBlock *, BlockInfoType> BlockInfo;
+ /// Use MapVector to get deterministic iteration order.
+ MapVector<BasicBlock *, BlockInfoType> BlockInfo;
bool isLive(BasicBlock *BB) { return BlockInfo[BB].Live; }
/// Mapping of instructions to associated information.
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index d79ae851005..6a27fbca8b7 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -2,6 +2,7 @@ add_llvm_library(LLVMScalarOpts
ADCE.cpp
AlignmentFromAssumptions.cpp
BDCE.cpp
+ CallSiteSplitting.cpp
ConstantHoisting.cpp
ConstantProp.cpp
CorrelatedValuePropagation.cpp
diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp
new file mode 100644
index 00000000000..b70ed8d7d4c
--- /dev/null
+++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -0,0 +1,492 @@
+//===- CallSiteSplitting.cpp ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a transformation that tries to split a call-site to pass
+// more constrained arguments if its argument is predicated in the control flow
+// so that we can expose better context to the later passes (e.g., inliner, jump
+// threading, or IPA-CP based function cloning, etc.).
+// As of now we support two cases:
+//
+// 1) If a call site is dominated by an OR condition and if any of its arguments
+// are predicated on this OR condition, try to split the call site with more
+// constrained arguments. For example, in the code below, we try to split the
+// call site since we can predicate the argument (ptr) based on the OR condition.
+//
+// Split from :
+// if (!ptr || c)
+// callee(ptr);
+// to :
+// if (!ptr)
+// callee(null) // set the known constant value
+// else if (c)
+// callee(nonnull ptr) // set non-null attribute in the argument
+//
+// 2) We can also split a call-site based on constant incoming values of a PHI
+// For example,
+// from :
+// Header:
+// %c = icmp eq i32 %i1, %i2
+// br i1 %c, label %Tail, label %TBB
+// TBB:
+//      br label %Tail
+// Tail:
+// %p = phi i32 [ 0, %Header], [ 1, %TBB]
+// call void @bar(i32 %p)
+// to
+// Header:
+// %c = icmp eq i32 %i1, %i2
+// br i1 %c, label %Tail-split0, label %TBB
+// TBB:
+// br label %Tail-split1
+// Tail-split0:
+// call void @bar(i32 0)
+// br label %Tail
+// Tail-split1:
+// call void @bar(i32 1)
+// br label %Tail
+// Tail:
+// %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ]
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "callsite-splitting"
+
+STATISTIC(NumCallSiteSplit, "Number of call-sites split");
+
+static void addNonNullAttribute(Instruction *CallI, Instruction *&NewCallI,
+ Value *Op) {
+ if (!NewCallI)
+ NewCallI = CallI->clone();
+ CallSite CS(NewCallI);
+ unsigned ArgNo = 0;
+ for (auto &I : CS.args()) {
+ if (&*I == Op)
+ CS.addParamAttr(ArgNo, Attribute::NonNull);
+ ++ArgNo;
+ }
+}
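// Editorial sketch (not part of this patch): given a hypothetical call
//   %r = call i1 @callee(i32* %a, i32* %b)
// addNonNullAttribute(CallI, NewCallI, %a) clones the call once (lazily, into
// NewCallI) and marks every argument operand equal to %a as nonnull:
//   %r = call i1 @callee(i32* nonnull %a, i32* %b)
// setConstantInArgument below follows the same lazy-clone pattern but rewrites
// the matching argument to a constant instead.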
+
+static void setConstantInArgument(Instruction *CallI, Instruction *&NewCallI,
+ Value *Op, Constant *ConstValue) {
+ if (!NewCallI)
+ NewCallI = CallI->clone();
+ CallSite CS(NewCallI);
+ unsigned ArgNo = 0;
+ for (auto &I : CS.args()) {
+ if (&*I == Op)
+ CS.setArgument(ArgNo, ConstValue);
+ ++ArgNo;
+ }
+}
+
+static bool createCallSitesOnOrPredicatedArgument(
+ CallSite CS, Instruction *&NewCSTakenFromHeader,
+ Instruction *&NewCSTakenFromNextCond,
+ SmallVectorImpl<BranchInst *> &BranchInsts, BasicBlock *HeaderBB) {
+ assert(BranchInsts.size() <= 2 &&
+ "Unexpected number of blocks in the OR predicated condition");
+ Instruction *Instr = CS.getInstruction();
+ BasicBlock *CallSiteBB = Instr->getParent();
+ TerminatorInst *HeaderTI = HeaderBB->getTerminator();
+ bool IsCSInTakenPath = CallSiteBB == HeaderTI->getSuccessor(0);
+
+ for (unsigned I = 0, E = BranchInsts.size(); I != E; ++I) {
+ BranchInst *PBI = BranchInsts[I];
+ assert(isa<ICmpInst>(PBI->getCondition()) &&
+ "Unexpected condition in a conditional branch.");
+ ICmpInst *Cmp = cast<ICmpInst>(PBI->getCondition());
+ Value *Arg = Cmp->getOperand(0);
+ assert(isa<Constant>(Cmp->getOperand(1)) &&
+ "Expected op1 to be a constant.");
+ Constant *ConstVal = cast<Constant>(Cmp->getOperand(1));
+ CmpInst::Predicate Pred = Cmp->getPredicate();
+
+ if (PBI->getParent() == HeaderBB) {
+ Instruction *&CallTakenFromHeader =
+ IsCSInTakenPath ? NewCSTakenFromHeader : NewCSTakenFromNextCond;
+ Instruction *&CallUntakenFromHeader =
+ IsCSInTakenPath ? NewCSTakenFromNextCond : NewCSTakenFromHeader;
+
+ assert((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
+ "Unexpected predicate in an OR condition");
+
+      // Set the constant value for arguments in the call predicated on
+ // the OR condition.
+ Instruction *&CallToSetConst = Pred == ICmpInst::ICMP_EQ
+ ? CallTakenFromHeader
+ : CallUntakenFromHeader;
+ setConstantInArgument(Instr, CallToSetConst, Arg, ConstVal);
+
+ // Add the NonNull attribute if compared with the null pointer.
+ if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
+ Instruction *&CallToSetAttr = Pred == ICmpInst::ICMP_EQ
+ ? CallUntakenFromHeader
+ : CallTakenFromHeader;
+ addNonNullAttribute(Instr, CallToSetAttr, Arg);
+ }
+ continue;
+ }
+
+ if (Pred == ICmpInst::ICMP_EQ) {
+ if (PBI->getSuccessor(0) == Instr->getParent()) {
+ // Set the constant value for the call taken from the second block in
+ // the OR condition.
+ setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal);
+ } else {
+ // Add the NonNull attribute if compared with the null pointer for the
+ // call taken from the second block in the OR condition.
+ if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue())
+ addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg);
+ }
+ } else {
+ if (PBI->getSuccessor(0) == Instr->getParent()) {
+ // Add the NonNull attribute if compared with the null pointer for the
+ // call taken from the second block in the OR condition.
+ if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue())
+ addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg);
+ } else if (Pred == ICmpInst::ICMP_NE) {
+ // Set the constant value for the call in the untaken path from the
+ // header block.
+ setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal);
+ } else
+ llvm_unreachable("Unexpected condition");
+ }
+ }
+ return NewCSTakenFromHeader || NewCSTakenFromNextCond;
+}
+
+static bool canSplitCallSite(CallSite CS) {
+ // FIXME: As of now we handle only CallInst. InvokeInst could be handled
+ // without too much effort.
+ Instruction *Instr = CS.getInstruction();
+ if (!isa<CallInst>(Instr))
+ return false;
+
+ // Allow splitting a call-site only when there is no instruction before the
+ // call-site in the basic block. Based on this constraint, we only clone the
+ // call instruction, and we do not move a call-site across any other
+ // instruction.
+ BasicBlock *CallSiteBB = Instr->getParent();
+ if (Instr != CallSiteBB->getFirstNonPHI())
+ return false;
+
+ pred_iterator PII = pred_begin(CallSiteBB);
+ pred_iterator PIE = pred_end(CallSiteBB);
+ unsigned NumPreds = std::distance(PII, PIE);
+
+ // Allow only one extra call-site. No more than two from one call-site.
+ if (NumPreds != 2)
+ return false;
+
+ // Cannot split an edge from an IndirectBrInst.
+ BasicBlock *Preds[2] = {*PII++, *PII};
+ if (isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
+ isa<IndirectBrInst>(Preds[1]->getTerminator()))
+ return false;
+
+ return CallSiteBB->canSplitPredecessors();
+}
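// Editorial sketch (not part of this patch): the checks above restrict
// splitting to a hypothetical shape like
//   PredBB1 --\
//              Tail:  %r = call ...   ; first non-PHI instruction in Tail
//   PredBB2 --/
// i.e. exactly two predecessors, neither ending in an indirectbr, and no
// instruction before the call other than PHIs.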
+
+/// Return true if the CS is split into its new predecessors which are directly
+/// hooked to each of its original predecessors pointed to by PredBB1 and PredBB2.
+/// Note that PredBB1 and PredBB2 are decided in findPredicatedArgument(),
+/// especially for the OR predicated case where PredBB1 will point to the header,
+/// and PredBB2 will point to the second compare block. CallInst1 and CallInst2
+/// will be the new call-sites placed in the new predecessors split for PredBB1
+/// and PredBB2, respectively. Therefore, CallInst1 will be the call-site placed
+/// between Header and Tail, and CallInst2 will be the call-site between TBB and
+/// Tail. For example, in the IR below with an OR condition, the call-site can
+/// be split
+///
+/// from :
+///
+/// Header:
+/// %c = icmp eq i32* %a, null
+/// br i1 %c %Tail, %TBB
+/// TBB:
+/// %c2 = icmp eq i32* %b, null
+///   br i1 %c2 %Tail, %End
+/// Tail:
+/// %ca = call i1 @callee (i32* %a, i32* %b)
+///
+/// to :
+///
+/// Header: // PredBB1 is Header
+/// %c = icmp eq i32* %a, null
+/// br i1 %c %Tail-split1, %TBB
+/// TBB: // PredBB2 is TBB
+/// %c2 = icmp eq i32* %b, null
+///   br i1 %c2 %Tail-split2, %End
+/// Tail-split1:
+/// %ca1 = call @callee (i32* null, i32* %b) // CallInst1
+/// br %Tail
+/// Tail-split2:
+/// %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2
+/// br %Tail
+/// Tail:
+/// %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]
+///
+/// Note that for an OR predicated case, CallInst1 and CallInst2 should be
+/// created with more constrained arguments in
+/// createCallSitesOnOrPredicatedArgument().
+static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
+ Instruction *CallInst1, Instruction *CallInst2) {
+ Instruction *Instr = CS.getInstruction();
+ BasicBlock *TailBB = Instr->getParent();
+ assert(Instr == (TailBB->getFirstNonPHI()) && "Unexpected call-site");
+
+ BasicBlock *SplitBlock1 =
+ SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split");
+ BasicBlock *SplitBlock2 =
+ SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split");
+
+ assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split.");
+
+ if (!CallInst1)
+ CallInst1 = Instr->clone();
+ if (!CallInst2)
+ CallInst2 = Instr->clone();
+
+ CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt());
+ CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt());
+
+ CallSite CS1(CallInst1);
+ CallSite CS2(CallInst2);
+
+ // Handle PHIs used as arguments in the call-site.
+ for (auto &PI : *TailBB) {
+ PHINode *PN = dyn_cast<PHINode>(&PI);
+ if (!PN)
+ break;
+ unsigned ArgNo = 0;
+ for (auto &CI : CS.args()) {
+ if (&*CI == PN) {
+ CS1.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock1));
+ CS2.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock2));
+ }
+ ++ArgNo;
+ }
+ }
+
+  // Replace users of the original call with a PHI merging the split call-sites.
+ if (Instr->getNumUses()) {
+ PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call", Instr);
+ PN->addIncoming(CallInst1, SplitBlock1);
+ PN->addIncoming(CallInst2, SplitBlock2);
+ Instr->replaceAllUsesWith(PN);
+ }
+ DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
+ DEBUG(dbgs() << " " << *CallInst1 << " in " << SplitBlock1->getName()
+ << "\n");
+ DEBUG(dbgs() << " " << *CallInst2 << " in " << SplitBlock2->getName()
+ << "\n");
+ Instr->eraseFromParent();
+ NumCallSiteSplit++;
+}
+
+static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) {
+ assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand.");
+ Value *Op0 = Cmp->getOperand(0);
+ unsigned ArgNo = 0;
+ for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
+ ++I, ++ArgNo) {
+    // Don't consider constant arguments or arguments already known non-null.
+ if (isa<Constant>(*I) || CS.paramHasAttr(ArgNo, Attribute::NonNull))
+ continue;
+
+ if (*I == Op0)
+ return true;
+ }
+ return false;
+}
+
+static void findOrCondRelevantToCallArgument(
+ CallSite CS, BasicBlock *PredBB, BasicBlock *OtherPredBB,
+ SmallVectorImpl<BranchInst *> &BranchInsts, BasicBlock *&HeaderBB) {
+ auto *PBI = dyn_cast<BranchInst>(PredBB->getTerminator());
+ if (!PBI || !PBI->isConditional())
+ return;
+
+ if (PBI->getSuccessor(0) == OtherPredBB ||
+ PBI->getSuccessor(1) == OtherPredBB)
+ if (PredBB == OtherPredBB->getSinglePredecessor()) {
+ assert(!HeaderBB && "Expect to find only a single header block");
+ HeaderBB = PredBB;
+ }
+
+ CmpInst::Predicate Pred;
+ Value *Cond = PBI->getCondition();
+ if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant())))
+ return;
+ ICmpInst *Cmp = cast<ICmpInst>(Cond);
+ if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)
+ if (isCondRelevantToAnyCallArgument(Cmp, CS))
+ BranchInsts.push_back(PBI);
+}
+
+// Return true if the call-site has an argument which is a PHI with only
+// constant incoming values.
+static bool isPredicatedOnPHI(CallSite CS) {
+ Instruction *Instr = CS.getInstruction();
+ BasicBlock *Parent = Instr->getParent();
+ if (Instr != Parent->getFirstNonPHI())
+ return false;
+
+ for (auto &BI : *Parent) {
+ if (PHINode *PN = dyn_cast<PHINode>(&BI)) {
+ for (auto &I : CS.args())
+ if (&*I == PN) {
+ assert(PN->getNumIncomingValues() == 2 &&
+ "Unexpected number of incoming values");
+ if (PN->getIncomingBlock(0) == PN->getIncomingBlock(1))
+ return false;
+ if (PN->getIncomingValue(0) == PN->getIncomingValue(1))
+ continue;
+ if (isa<Constant>(PN->getIncomingValue(0)) &&
+ isa<Constant>(PN->getIncomingValue(1)))
+ return true;
+ }
+ }
+ break;
+ }
+ return false;
+}
+
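+// For reference, a minimal IR sketch of the PHI-predicated pattern that
+// isPredicatedOnPHI() looks for: the call argument is a PHI whose incoming
+// values are distinct constants from two different predecessors, so each
+// split call-site would see a literal constant. The sketch below is
+// illustrative only; the function and block names are assumptions, not part
+// of this patch.
+//
+//   define internal i32 @callee(i32 %v) {
+//     %r = add i32 %v, 1
+//     ret i32 %r
+//   }
+//
+//   define i32 @caller(i1 %c) {
+//   entry:
+//     br i1 %c, label %BB1, label %BB2
+//   BB1:
+//     br label %Tail
+//   BB2:
+//     br label %Tail
+//   Tail:
+//     %a = phi i32 [ 1, %BB1 ], [ 2, %BB2 ]   ; constant incoming values only
+//     %r = call i32 @callee(i32 %a)           ; first non-PHI in the block
+//     ret i32 %r
+//   }
+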
+// Return true if an argument in CS is predicated on an 'or' condition.
+// Create new call-sites with arguments constrained based on the OR condition.
+static bool findPredicatedOnOrCondition(CallSite CS, BasicBlock *PredBB1,
+ BasicBlock *PredBB2,
+ Instruction *&NewCallTakenFromHeader,
+ Instruction *&NewCallTakenFromNextCond,
+ BasicBlock *&HeaderBB) {
+ SmallVector<BranchInst *, 4> BranchInsts;
+ findOrCondRelevantToCallArgument(CS, PredBB1, PredBB2, BranchInsts, HeaderBB);
+ findOrCondRelevantToCallArgument(CS, PredBB2, PredBB1, BranchInsts, HeaderBB);
+ if (BranchInsts.empty() || !HeaderBB)
+ return false;
+
+ // If an OR condition is detected, try to create call sites with constrained
+ // arguments (e.g., NonNull attribute or constant value).
+ return createCallSitesOnOrPredicatedArgument(CS, NewCallTakenFromHeader,
+ NewCallTakenFromNextCond,
+ BranchInsts, HeaderBB);
+}
+
+static bool findPredicatedArgument(CallSite CS, Instruction *&CallInst1,
+ Instruction *&CallInst2,
+ BasicBlock *&PredBB1, BasicBlock *&PredBB2) {
+ BasicBlock *CallSiteBB = CS.getInstruction()->getParent();
+ pred_iterator PII = pred_begin(CallSiteBB);
+ pred_iterator PIE = pred_end(CallSiteBB);
+ assert(std::distance(PII, PIE) == 2 && "Expect only two predecessors.");
+ (void)PIE;
+ BasicBlock *Preds[2] = {*PII++, *PII};
+ BasicBlock *&HeaderBB = PredBB1;
+ if (!findPredicatedOnOrCondition(CS, Preds[0], Preds[1], CallInst1, CallInst2,
+ HeaderBB) &&
+ !isPredicatedOnPHI(CS))
+ return false;
+
+ if (!PredBB1)
+ PredBB1 = Preds[0];
+
+ PredBB2 = PredBB1 == Preds[0] ? Preds[1] : Preds[0];
+ return true;
+}
+
+static bool tryToSplitCallSite(CallSite CS) {
+ if (!CS.arg_size())
+ return false;
+
+ BasicBlock *PredBB1 = nullptr;
+ BasicBlock *PredBB2 = nullptr;
+ Instruction *CallInst1 = nullptr;
+ Instruction *CallInst2 = nullptr;
+ if (!canSplitCallSite(CS) ||
+ !findPredicatedArgument(CS, CallInst1, CallInst2, PredBB1, PredBB2)) {
+ assert(!CallInst1 && !CallInst2 && "Unexpected new call-sites cloned.");
+ return false;
+ }
+ splitCallSite(CS, PredBB1, PredBB2, CallInst1, CallInst2);
+ return true;
+}
+
+static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
+ bool Changed = false;
+ for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
+ BasicBlock &BB = *BI++;
+ for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
+ Instruction *I = &*II++;
+ CallSite CS(cast<Value>(I));
+ if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI))
+ continue;
+
+ Function *Callee = CS.getCalledFunction();
+ if (!Callee || Callee->isDeclaration())
+ continue;
+ Changed |= tryToSplitCallSite(CS);
+ }
+ }
+ return Changed;
+}
+
+namespace {
+struct CallSiteSplittingLegacyPass : public FunctionPass {
+ static char ID;
+ CallSiteSplittingLegacyPass() : FunctionPass(ID) {
+ initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ return doCallSiteSplitting(F, TLI);
+ }
+};
+} // namespace
+
+char CallSiteSplittingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
+ "Call-site splitting", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
+ "Call-site splitting", false, false)
+FunctionPass *llvm::createCallSiteSplittingPass() {
+ return new CallSiteSplittingLegacyPass();
+}
+
+PreservedAnalyses CallSiteSplittingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+ if (!doCallSiteSplitting(F, TLI))
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ return PA;
+}
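
A minimal .ll sketch of the OR-predicated pattern described in the splitCallSite() comment near the top of this new file, exercised through the pass flag registered above (-callsite-splitting). The RUN line, the @callee body, and the block names are illustrative assumptions, not taken from the patch's tests.

; RUN: opt -S -callsite-splitting < %s
define internal i1 @callee(i32* %a, i32* %b) {
  %r = icmp eq i32* %a, %b
  ret i1 %r
}

define i1 @caller(i32* %a, i32* %b) {
Header:
  %c = icmp eq i32* %a, null
  br i1 %c, label %Tail, label %TBB
TBB:
  %c2 = icmp eq i32* %b, null
  br i1 %c2, label %Tail, label %End
Tail:                                      ; two predecessors, call is first non-PHI
  %ca = call i1 @callee(i32* %a, i32* %b)
  ret i1 %ca
End:
  ret i1 false
}

If the pass fires as described in the comment, the call in %Tail is replaced by one call-site reached from %Header with a null first argument and one reached from %TBB with a nonnull first argument and null second argument, joined by a PHI.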
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 9ce42a06825..abb50f27f1c 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -48,6 +48,7 @@
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
@@ -1624,6 +1625,15 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
if (DU.NarrowDef->use_empty())
DeadInsts.emplace_back(DU.NarrowDef);
}
+
+ // Attach any debug information to the new PHI. Since OrigPhi and WidePHI
+ // evaluate the same recurrence, we can just copy the debug info over.
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ llvm::findDbgValues(DbgValues, OrigPhi);
+ auto *MDPhi = MetadataAsValue::get(WidePhi->getContext(),
+ ValueAsMetadata::get(WidePhi));
+ for (auto &DbgValue : DbgValues)
+ DbgValue->setOperand(0, MDPhi);
return WidePhi;
}
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index ade4fbbcb6f..e6cab3f34cf 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -192,11 +192,12 @@ JumpThreadingPass::JumpThreadingPass(int T) {
// P(cond == true ) = P(A) + P(cond == true | B) * P(B)
//
// which gives us:
-// P(A) <= P(c == true), i.e.
+// P(A) is no greater than P(cond == true), i.e.
// P(t == true) <= P(cond == true)
//
-// In other words, if we know P(cond == true), we know that P(t == true)
-// can not be greater than 1%.
+// In other words, if we know cond == true is unlikely, we know that
+// t == true is also unlikely.
+//
static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
if (!CondBr)
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 6ca8d602302..c60ec9f50f7 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -62,6 +62,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -93,9 +94,8 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE);
-static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
- const Loop *CurLoop, AliasSetTracker *CurAST,
- const LoopSafetyInfo *SafetyInfo,
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+ const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE);
static bool isSafeToExecuteUnconditionally(Instruction &Inst,
const DominatorTree *DT,
@@ -394,8 +394,12 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
//
if (isNotUsedInLoop(I, CurLoop, SafetyInfo) &&
canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) {
- ++II;
- Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo, ORE);
+ if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE)) {
+ ++II;
+ CurAST->deleteValue(&I);
+ I.eraseFromParent();
+ Changed = true;
+ }
}
}
}
@@ -717,26 +721,6 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
if (!BlockColors.empty() &&
BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
return false;
-
- // A PHI node where all of the incoming values are this instruction are
- // special -- they can just be RAUW'ed with the instruction and thus
- // don't require a use in the predecessor. This is a particular important
- // special case because it is the pattern found in LCSSA form.
- if (isTriviallyReplacablePHI(*PN, I)) {
- if (CurLoop->contains(PN))
- return false;
- else
- continue;
- }
-
- // Otherwise, PHI node uses occur in predecessor blocks if the incoming
- // values. Check for such a use being inside the loop.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (PN->getIncomingValue(i) == &I)
- if (CurLoop->contains(PN->getIncomingBlock(i)))
- return false;
-
- continue;
}
if (CurLoop->contains(UI))
@@ -806,14 +790,96 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
return New;
}
+static Instruction *sinkThroughTriviallyReplacablePHI(
+ PHINode *TPN, Instruction *I, LoopInfo *LI,
+ SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
+ const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop) {
+  assert(isTriviallyReplacablePHI(*TPN, *I) &&
+         "Expect only trivially replaceable PHI");
+ BasicBlock *ExitBlock = TPN->getParent();
+ Instruction *New;
+ auto It = SunkCopies.find(ExitBlock);
+ if (It != SunkCopies.end())
+ New = It->second;
+ else
+ New = SunkCopies[ExitBlock] =
+ CloneInstructionInExitBlock(*I, *ExitBlock, *TPN, LI, SafetyInfo);
+ return New;
+}
+
+static bool canSplitPredecessors(PHINode *PN) {
+ BasicBlock *BB = PN->getParent();
+ if (!BB->canSplitPredecessors())
+ return false;
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *BBPred = *PI;
+ if (isa<IndirectBrInst>(BBPred->getTerminator()))
+ return false;
+ }
+ return true;
+}
+
+static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
+ LoopInfo *LI, const Loop *CurLoop) {
+#ifndef NDEBUG
+ SmallVector<BasicBlock *, 32> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+#endif
+ BasicBlock *ExitBB = PN->getParent();
+ assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block.");
+
+  // Split predecessors of the loop exit so that instructions in the loop are
+  // exposed to exit blocks through trivially replaceable PHIs while keeping
+  // the loop in the canonical form where each predecessor of each exit block
+  // is contained within the loop. For example, this will convert the loop below
+ // from
+ //
+ // LB1:
+ // %v1 =
+ // br %LE, %LB2
+ // LB2:
+ // %v2 =
+ // br %LE, %LB1
+ // LE:
+  //  %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replaceable
+ //
+ // to
+ //
+ // LB1:
+ // %v1 =
+ // br %LE.split, %LB2
+ // LB2:
+ // %v2 =
+ // br %LE.split2, %LB1
+ // LE.split:
+  //  %p1 = phi [%v1, %LB1] <-- trivially replaceable
+  //  br %LE
+  // LE.split2:
+  //  %p2 = phi [%v2, %LB2] <-- trivially replaceable
+ // br %LE
+ // LE:
+ // %p = phi [%p1, %LE.split], [%p2, %LE.split2]
+ //
+ SmallSetVector<BasicBlock *, 8> PredBBs(pred_begin(ExitBB), pred_end(ExitBB));
+ while (!PredBBs.empty()) {
+ BasicBlock *PredBB = *PredBBs.begin();
+ assert(CurLoop->contains(PredBB) &&
+ "Expect all predecessors are in the loop");
+ if (PN->getBasicBlockIndex(PredBB) >= 0)
+ SplitBlockPredecessors(ExitBB, PredBB, ".split.loop.exit", DT, LI, true);
+ PredBBs.remove(PredBB);
+ }
+}
+
/// When an instruction is found to only be used outside of the loop, this
/// function moves it to the exit blocks and patches up SSA form as needed.
/// This method is guaranteed to remove the original instruction from its
/// position, and may either delete it or move it to outside of the loop.
///
-static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
- const Loop *CurLoop, AliasSetTracker *CurAST,
- const LoopSafetyInfo *SafetyInfo,
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+ const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE) {
DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
ORE->emit([&]() {
@@ -828,57 +894,75 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
++NumSunk;
Changed = true;
-#ifndef NDEBUG
- SmallVector<BasicBlock *, 32> ExitBlocks;
- CurLoop->getUniqueExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
-#endif
+  // Iterate over users to be ready for actual sinking. Replace users in
+  // unreachable blocks with undef and make all user PHIs trivially replaceable.
+ SmallPtrSet<Instruction *, 8> VisitedUsers;
+ for (Value::user_iterator UI = I.user_begin(), UE = I.user_end(); UI != UE;) {
+ auto *User = cast<Instruction>(*UI);
+ Use &U = UI.getUse();
+ ++UI;
- // Clones of this instruction. Don't create more than one per exit block!
- SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+ if (VisitedUsers.count(User))
+ continue;
- // If this instruction is only used outside of the loop, then all users are
- // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
- // the instruction.
- while (!I.use_empty()) {
- Value::user_iterator UI = I.user_begin();
- auto *User = cast<Instruction>(*UI);
if (!DT->isReachableFromEntry(User->getParent())) {
User->replaceUsesOfWith(&I, UndefValue::get(I.getType()));
continue;
}
+
// The user must be a PHI node.
PHINode *PN = cast<PHINode>(User);
// Surprisingly, instructions can be used outside of loops without any
// exits. This can only happen in PHI nodes if the incoming block is
// unreachable.
- Use &U = UI.getUse();
BasicBlock *BB = PN->getIncomingBlock(U);
if (!DT->isReachableFromEntry(BB)) {
U = UndefValue::get(I.getType());
continue;
}
- BasicBlock *ExitBlock = PN->getParent();
- assert(ExitBlockSet.count(ExitBlock) &&
- "The LCSSA PHI is not in an exit block!");
+ VisitedUsers.insert(PN);
+ if (isTriviallyReplacablePHI(*PN, I))
+ continue;
- Instruction *New;
- auto It = SunkCopies.find(ExitBlock);
- if (It != SunkCopies.end())
- New = It->second;
- else
- New = SunkCopies[ExitBlock] =
- CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI, SafetyInfo);
+ if (!canSplitPredecessors(PN))
+ return false;
+
+      // Split predecessors of the PHI so that we can make users trivially
+      // replaceable.
+ splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop);
+ // Should rebuild the iterators, as they may be invalidated by
+ // splitPredecessorsOfLoopExit().
+ UI = I.user_begin();
+ UE = I.user_end();
+ }
+
+#ifndef NDEBUG
+ SmallVector<BasicBlock *, 32> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+#endif
+
+ // Clones of this instruction. Don't create more than one per exit block!
+ SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+
+ // If this instruction is only used outside of the loop, then all users are
+ // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
+ // the instruction.
+ while (!I.use_empty()) {
+ Value::user_iterator UI = I.user_begin();
+ PHINode *PN = cast<PHINode>(*UI);
+ assert(ExitBlockSet.count(PN->getParent()) &&
+ "The LCSSA PHI is not in an exit block!");
+    // The PHI must be trivially replaceable.
+ Instruction *New = sinkThroughTriviallyReplacablePHI(PN, &I, LI, SunkCopies,
+ SafetyInfo, CurLoop);
PN->replaceAllUsesWith(New);
PN->eraseFromParent();
}
-
- CurAST->deleteValue(&I);
- I.eraseFromParent();
return Changed;
}
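To make the new predecessor-splitting path concrete, here is an illustrative .ll sketch (not from the patch; the RUN line and names are assumptions): %v1 is loop-invariant, unused inside the loop, and only reaches the exit through a PHI that is not trivially replaceable, so sinking it now splits the exit edges first.

; RUN: opt -S -licm < %s
define i32 @sink_through_split_exit(i1 %c1, i1 %c2, i32 %a, i32 %b) {
entry:
  br label %LB1
LB1:                                              ; loop header
  %v1 = add i32 %a, %b                            ; invariant, unused in the loop
  br i1 %c1, label %LE, label %LB2
LB2:                                              ; latch
  br i1 %c2, label %LE, label %LB1
LE:                                               ; exit; PHI is not trivially replaceable
  %p = phi i32 [ %v1, %LB1 ], [ %a, %LB2 ]
  ret i32 %p
}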
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 413fb75d172..eb5f3cc47ce 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1326,9 +1326,9 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
// step 2: detect instructions corresponding to "x.next = x >> 1"
if (!DefX || DefX->getOpcode() != Instruction::AShr)
return false;
- if (ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)))
- if (!Shft || !Shft->isOne())
- return false;
+ ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
+ if (!Shft || !Shft->isOne())
+ return false;
VarX = DefX->getOperand(0);
// step 3: Check the recurrence of variable X
diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp
index 9a623be234f..52dea3254e7 100644
--- a/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/lib/Transforms/Scalar/LoopPredication.cpp
@@ -174,6 +174,9 @@
using namespace llvm;
+static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation",
+ cl::Hidden, cl::init(true));
+
namespace {
class LoopPredication {
/// Represents an induction variable check:
@@ -186,6 +189,10 @@ class LoopPredication {
const SCEV *Limit)
: Pred(Pred), IV(IV), Limit(Limit) {}
LoopICmp() {}
+ void dump() {
+ dbgs() << "LoopICmp Pred = " << Pred << ", IV = " << *IV
+ << ", Limit = " << *Limit << "\n";
+ }
};
ScalarEvolution *SE;
@@ -195,6 +202,7 @@ class LoopPredication {
BasicBlock *Preheader;
LoopICmp LatchCheck;
+ bool isSupportedStep(const SCEV* Step);
Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI) {
return parseLoopICmp(ICI->getPredicate(), ICI->getOperand(0),
ICI->getOperand(1));
@@ -204,14 +212,36 @@ class LoopPredication {
Optional<LoopICmp> parseLoopLatchICmp();
+ bool CanExpand(const SCEV* S);
Value *expandCheck(SCEVExpander &Expander, IRBuilder<> &Builder,
ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS,
Instruction *InsertAt);
Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander,
IRBuilder<> &Builder);
+ Optional<Value *> widenICmpRangeCheckIncrementingLoop(LoopICmp LatchCheck,
+ LoopICmp RangeCheck,
+ SCEVExpander &Expander,
+ IRBuilder<> &Builder);
+
bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
+  // When the IV type is wider than the range operand type, we can still do
+  // loop predication by generating SCEVs for the range and latch that are of
+  // the same type. We achieve this by generating a SCEV truncate expression
+  // for the latch IV. This is done only if truncation of the IV is a safe
+  // operation, without loss of information.
+  // Another way to achieve this is by generating a wider type SCEV for the
+  // range check operand; however, this needs a more involved check that the
+  // operands do not overflow. This can lead to loss of information when the
+  // range operand is of the form: add i32 %offset, %iv. We need to prove that
+  // sext(x + y) is the same as sext(x) + sext(y).
+  // This function returns true if we can safely represent the IV type in
+  // the RangeCheckType without loss of information.
+ bool isSafeToTruncateWideIVType(Type *RangeCheckType);
+ // Return the loopLatchCheck corresponding to the RangeCheckType if safe to do
+ // so.
+ Optional<LoopICmp> generateLoopLatchCheck(Type *RangeCheckType);
public:
LoopPredication(ScalarEvolution *SE) : SE(SE){};
bool runOnLoop(Loop *L);
@@ -301,53 +331,54 @@ Value *LoopPredication::expandCheck(SCEVExpander &Expander,
return Builder.CreateICmp(Pred, LHSV, RHSV);
}
-/// If ICI can be widened to a loop invariant condition emits the loop
-/// invariant condition in the loop preheader and return it, otherwise
-/// returns None.
-Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
- SCEVExpander &Expander,
- IRBuilder<> &Builder) {
- DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
- DEBUG(ICI->dump());
+Optional<LoopPredication::LoopICmp>
+LoopPredication::generateLoopLatchCheck(Type *RangeCheckType) {
- // parseLoopStructure guarantees that the latch condition is:
- // ++i <pred> latchLimit, where <pred> is u<, u<=, s<, or s<=.
- // We are looking for the range checks of the form:
- // i u< guardLimit
- auto RangeCheck = parseLoopICmp(ICI);
- if (!RangeCheck) {
- DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ auto *LatchType = LatchCheck.IV->getType();
+ if (RangeCheckType == LatchType)
+ return LatchCheck;
+ // For now, bail out if latch type is narrower than range type.
+ if (DL->getTypeSizeInBits(LatchType) < DL->getTypeSizeInBits(RangeCheckType))
return None;
- }
- if (RangeCheck->Pred != ICmpInst::ICMP_ULT) {
- DEBUG(dbgs() << "Unsupported range check predicate(" << RangeCheck->Pred
- << ")!\n");
+ if (!isSafeToTruncateWideIVType(RangeCheckType))
return None;
- }
- auto *RangeCheckIV = RangeCheck->IV;
- auto *Ty = RangeCheckIV->getType();
- if (Ty != LatchCheck.IV->getType()) {
- DEBUG(dbgs() << "Type mismatch between range check and latch IVs!\n");
+ // We can now safely identify the truncated version of the IV and limit for
+ // RangeCheckType.
+ LoopICmp NewLatchCheck;
+ NewLatchCheck.Pred = LatchCheck.Pred;
+ NewLatchCheck.IV = dyn_cast<SCEVAddRecExpr>(
+ SE->getTruncateExpr(LatchCheck.IV, RangeCheckType));
+ if (!NewLatchCheck.IV)
return None;
- }
- if (!RangeCheckIV->isAffine()) {
- DEBUG(dbgs() << "Range check IV is not affine!\n");
- return None;
- }
- auto *Step = RangeCheckIV->getStepRecurrence(*SE);
- if (Step != LatchCheck.IV->getStepRecurrence(*SE)) {
- DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
- return None;
- }
- assert(Step->isOne() && "must be one");
+ NewLatchCheck.Limit = SE->getTruncateExpr(LatchCheck.Limit, RangeCheckType);
+ DEBUG(dbgs() << "IV of type: " << *LatchType
+ << "can be represented as range check type:" << *RangeCheckType
+ << "\n");
+ DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n");
+ DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n");
+ return NewLatchCheck;
+}
+
+bool LoopPredication::isSupportedStep(const SCEV* Step) {
+ return Step->isOne();
+}
- // Generate the widened condition:
+bool LoopPredication::CanExpand(const SCEV* S) {
+ return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
+}
+
+Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
+ LoopPredication::LoopICmp LatchCheck, LoopPredication::LoopICmp RangeCheck,
+ SCEVExpander &Expander, IRBuilder<> &Builder) {
+ auto *Ty = RangeCheck.IV->getType();
+ // Generate the widened condition for the forward loop:
// guardStart u< guardLimit &&
// latchLimit <pred> guardLimit - 1 - guardStart + latchStart
// where <pred> depends on the latch condition predicate. See the file
// header comment for the reasoning.
- const SCEV *GuardStart = RangeCheckIV->getStart();
- const SCEV *GuardLimit = RangeCheck->Limit;
+ // guardLimit - guardStart + latchStart - 1
+ const SCEV *GuardStart = RangeCheck.IV->getStart();
+ const SCEV *GuardLimit = RangeCheck.Limit;
const SCEV *LatchStart = LatchCheck.IV->getStart();
const SCEV *LatchLimit = LatchCheck.Limit;
@@ -355,7 +386,11 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
const SCEV *RHS =
SE->getAddExpr(SE->getMinusSCEV(GuardLimit, GuardStart),
SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));
-
+ if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
+ !CanExpand(LatchLimit) || !CanExpand(RHS)) {
+ DEBUG(dbgs() << "Can't expand limit check!\n");
+ return None;
+ }
ICmpInst::Predicate LimitCheckPred;
switch (LatchCheck.Pred) {
case ICmpInst::ICMP_ULT:
@@ -378,22 +413,68 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
DEBUG(dbgs() << "RHS: " << *RHS << "\n");
DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
- auto CanExpand = [this](const SCEV *S) {
- return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
- };
- if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
- !CanExpand(LatchLimit) || !CanExpand(RHS)) {
- DEBUG(dbgs() << "Can't expand limit check!\n");
- return None;
- }
-
Instruction *InsertAt = Preheader->getTerminator();
auto *LimitCheck =
expandCheck(Expander, Builder, LimitCheckPred, LatchLimit, RHS, InsertAt);
- auto *FirstIterationCheck = expandCheck(Expander, Builder, RangeCheck->Pred,
+ auto *FirstIterationCheck = expandCheck(Expander, Builder, RangeCheck.Pred,
GuardStart, GuardLimit, InsertAt);
return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
}
+/// If ICI can be widened to a loop invariant condition emits the loop
+/// invariant condition in the loop preheader and return it, otherwise
+/// returns None.
+Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
+ SCEVExpander &Expander,
+ IRBuilder<> &Builder) {
+ DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
+ DEBUG(ICI->dump());
+
+ // parseLoopStructure guarantees that the latch condition is:
+ // ++i <pred> latchLimit, where <pred> is u<, u<=, s<, or s<=.
+ // We are looking for the range checks of the form:
+ // i u< guardLimit
+ auto RangeCheck = parseLoopICmp(ICI);
+ if (!RangeCheck) {
+ DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ return None;
+ }
+ DEBUG(dbgs() << "Guard check:\n");
+ DEBUG(RangeCheck->dump());
+ if (RangeCheck->Pred != ICmpInst::ICMP_ULT) {
+ DEBUG(dbgs() << "Unsupported range check predicate(" << RangeCheck->Pred
+ << ")!\n");
+ return None;
+ }
+ auto *RangeCheckIV = RangeCheck->IV;
+ if (!RangeCheckIV->isAffine()) {
+ DEBUG(dbgs() << "Range check IV is not affine!\n");
+ return None;
+ }
+ auto *Step = RangeCheckIV->getStepRecurrence(*SE);
+ // We cannot just compare with latch IV step because the latch and range IVs
+ // may have different types.
+ if (!isSupportedStep(Step)) {
+ DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
+ return None;
+ }
+ auto *Ty = RangeCheckIV->getType();
+ auto CurrLatchCheckOpt = generateLoopLatchCheck(Ty);
+ if (!CurrLatchCheckOpt) {
+ DEBUG(dbgs() << "Failed to generate a loop latch check "
+ "corresponding to range type: "
+ << *Ty << "\n");
+ return None;
+ }
+
+ LoopICmp CurrLatchCheck = *CurrLatchCheckOpt;
+ // At this point the range check step and latch step should have the same
+ // value and type.
+ assert(Step == CurrLatchCheck.IV->getStepRecurrence(*SE) &&
+ "Range and latch should have same step recurrence!");
+
+ return widenICmpRangeCheckIncrementingLoop(CurrLatchCheck, *RangeCheck,
+ Expander, Builder);
+}
bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
SCEVExpander &Expander) {
@@ -485,15 +566,6 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
return None;
}
- if (Result->Pred != ICmpInst::ICMP_ULT &&
- Result->Pred != ICmpInst::ICMP_SLT &&
- Result->Pred != ICmpInst::ICMP_ULE &&
- Result->Pred != ICmpInst::ICMP_SLE) {
- DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
- << ")!\n");
- return None;
- }
-
// Check affine first, so if it's not we don't try to compute the step
// recurrence.
if (!Result->IV->isAffine()) {
@@ -502,14 +574,55 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
}
auto *Step = Result->IV->getStepRecurrence(*SE);
- if (!Step->isOne()) {
+ if (!isSupportedStep(Step)) {
DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
return None;
}
+ auto IsUnsupportedPredicate = [](const SCEV *Step, ICmpInst::Predicate Pred) {
+ assert(Step->isOne() && "expected Step to be one!");
+ return Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_SLT &&
+ Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_SLE;
+ };
+
+ if (IsUnsupportedPredicate(Step, Result->Pred)) {
+ DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
+ << ")!\n");
+ return None;
+ }
return Result;
}
+// Returns true if it's safe to truncate the IV to RangeCheckType.
+bool LoopPredication::isSafeToTruncateWideIVType(Type *RangeCheckType) {
+ if (!EnableIVTruncation)
+ return false;
+ assert(DL->getTypeSizeInBits(LatchCheck.IV->getType()) >
+ DL->getTypeSizeInBits(RangeCheckType) &&
+ "Expected latch check IV type to be larger than range check operand "
+ "type!");
+ // The start and end values of the IV should be known. This is to guarantee
+ // that truncating the wide type will not lose information.
+ auto *Limit = dyn_cast<SCEVConstant>(LatchCheck.Limit);
+ auto *Start = dyn_cast<SCEVConstant>(LatchCheck.IV->getStart());
+ if (!Limit || !Start)
+ return false;
+ // This check makes sure that the IV does not change sign during loop
+ // iterations. Consider latchType = i64, LatchStart = 5, Pred = ICMP_SGE,
+ // LatchEnd = 2, rangeCheckType = i32. If it's not a monotonic predicate, the
+ // IV wraps around, and the truncation of the IV would lose the range of
+ // iterations between 2^32 and 2^64.
+ bool Increasing;
+ if (!SE->isMonotonicPredicate(LatchCheck.IV, LatchCheck.Pred, Increasing))
+ return false;
+ // The active bits should be less than the bits in the RangeCheckType. This
+ // guarantees that truncating the latch check to RangeCheckType is a safe
+ // operation.
+ auto RangeCheckTypeBitSize = DL->getTypeSizeInBits(RangeCheckType);
+ return Start->getAPInt().getActiveBits() < RangeCheckTypeBitSize &&
+ Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize;
+}
+
bool LoopPredication::runOnLoop(Loop *Loop) {
L = Loop;
@@ -535,6 +648,9 @@ bool LoopPredication::runOnLoop(Loop *Loop) {
return false;
LatchCheck = *LatchCheckOpt;
+ DEBUG(dbgs() << "Latch check:\n");
+ DEBUG(LatchCheck.dump());
+
// Collect all the guards into a vector and process later, so as not
// to invalidate the instruction iterator.
SmallVector<IntrinsicInst *, 4> Guards;
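An illustrative .ll sketch of the case the new IV-truncation path is meant to handle: the latch IV is i64 with constant start and limit, while the guarded range check is performed on an i32 value, so the latch check can be truncated to the range-check type. The RUN line, names, and constants are assumptions for illustration, not taken from the patch's tests.

; RUN: opt -S -loop-predication < %s
declare void @llvm.experimental.guard(i1, ...)

define void @iv_wider_than_range_check(i32 %length, i32* %array) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %iv.i32 = trunc i64 %iv to i32
  %guard.cond = icmp ult i32 %iv.i32, %length     ; i32 range check
  call void (i1, ...) @llvm.experimental.guard(i1 %guard.cond) [ "deopt"() ]
  %addr = getelementptr inbounds i32, i32* %array, i32 %iv.i32
  store i32 0, i32* %addr
  %iv.next = add nuw nsw i64 %iv, 1
  %latch.cond = icmp ult i64 %iv.next, 1024       ; i64 latch check, constant bounds
  br i1 %latch.cond, label %loop, label %exit

exit:
  ret void
}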
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index bbb179d3790..7f03f2379e7 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1037,7 +1037,7 @@ struct LSRFixup {
Value *OperandValToReplace = nullptr;
/// If this user is to use the post-incremented value of an induction
- /// variable, this variable is non-null and holds the loop associated with the
+ /// variable, this set is non-empty and holds the loops associated with the
/// induction variable.
PostIncLoopSet PostIncLoops;
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index a44ca333fee..1f32f9f24aa 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -145,8 +145,7 @@ XorOpnd::XorOpnd(Value *V) {
static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
if (V->hasOneUse() && isa<Instruction>(V) &&
cast<Instruction>(V)->getOpcode() == Opcode &&
- (!isa<FPMathOperator>(V) ||
- cast<Instruction>(V)->hasUnsafeAlgebra()))
+ (!isa<FPMathOperator>(V) || cast<Instruction>(V)->isFast()))
return cast<BinaryOperator>(V);
return nullptr;
}
@@ -156,8 +155,7 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
if (V->hasOneUse() && isa<Instruction>(V) &&
(cast<Instruction>(V)->getOpcode() == Opcode1 ||
cast<Instruction>(V)->getOpcode() == Opcode2) &&
- (!isa<FPMathOperator>(V) ||
- cast<Instruction>(V)->hasUnsafeAlgebra()))
+ (!isa<FPMathOperator>(V) || cast<Instruction>(V)->isFast()))
return cast<BinaryOperator>(V);
return nullptr;
}
@@ -565,7 +563,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
assert((!isa<Instruction>(Op) ||
cast<Instruction>(Op)->getOpcode() != Opcode
|| (isa<FPMathOperator>(Op) &&
- !cast<Instruction>(Op)->hasUnsafeAlgebra())) &&
+ !cast<Instruction>(Op)->isFast())) &&
"Should have been handled above!");
assert(Op->hasOneUse() && "Has uses outside the expression tree!");
@@ -2017,8 +2015,8 @@ void ReassociatePass::OptimizeInst(Instruction *I) {
if (I->isCommutative())
canonicalizeOperands(I);
- // Don't optimize floating point instructions that don't have unsafe algebra.
- if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra())
+ // Don't optimize floating-point instructions unless they are 'fast'.
+ if (I->getType()->isFPOrFPVectorTy() && !I->isFast())
return;
// Do not reassociate boolean (i1) expressions. We want to preserve the
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 1ca77cfec32..44acfc88579 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -125,10 +125,10 @@ struct RewriteStatepointsForGC : public ModulePass {
Changed |= runOnFunction(F);
if (Changed) {
- // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn
+ // stripNonValidData asserts that shouldRewriteStatepointsIn
// returns true for at least one function in the module. Since at least
// one function changed, we know that the precondition is satisfied.
- stripNonValidAttributesAndMetadata(M);
+ stripNonValidData(M);
}
return Changed;
@@ -146,15 +146,17 @@ struct RewriteStatepointsForGC : public ModulePass {
/// metadata implying dereferenceability that are no longer valid/correct after
/// RewriteStatepointsForGC has run. This is because semantically, after
/// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
- /// heap. stripNonValidAttributesAndMetadata (conservatively) restores
+ /// heap. stripNonValidData (conservatively) restores
/// correctness by erasing all attributes in the module that externally imply
/// dereferenceability. Similar reasoning also applies to the noalias
/// attributes and metadata. gc.statepoint can touch the entire heap including
/// noalias objects.
- void stripNonValidAttributesAndMetadata(Module &M);
+ /// Apart from attributes and metadata, we also remove instructions that imply
+ /// constant physical memory: llvm.invariant.start.
+ void stripNonValidData(Module &M);
- // Helpers for stripNonValidAttributesAndMetadata
- void stripNonValidAttributesAndMetadataFromBody(Function &F);
+ // Helpers for stripNonValidData
+ void stripNonValidDataFromBody(Function &F);
void stripNonValidAttributesFromPrototype(Function &F);
// Certain metadata on instructions are invalid after running RS4GC.
@@ -2385,14 +2387,30 @@ void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I
I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC);
}
-void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) {
+void RewriteStatepointsForGC::stripNonValidDataFromBody(Function &F) {
if (F.empty())
return;
LLVMContext &Ctx = F.getContext();
MDBuilder Builder(Ctx);
+  // Set of invariant.start instructions that we need to remove.
+ // Use this to avoid invalidating the instruction iterator.
+ SmallVector<IntrinsicInst*, 12> InvariantStartInstructions;
+
for (Instruction &I : instructions(F)) {
+    // invariant.start on a memory location implies that the referenced memory
+    // location is constant and unchanging. This is no longer true after
+    // RewriteStatepointsForGC runs because there can be calls to gc.statepoint
+    // which conceptually free the entire heap, and the presence of
+    // invariant.start would allow the optimizer to sink a load of the memory
+    // location past a statepoint, which is incorrect.
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::invariant_start) {
+ InvariantStartInstructions.push_back(II);
+ continue;
+ }
+
if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
bool IsImmutableTBAA =
@@ -2422,6 +2440,12 @@ void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Functio
RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex);
}
}
+
+ // Delete the invariant.start instructions and RAUW undef.
+ for (auto *II : InvariantStartInstructions) {
+ II->replaceAllUsesWith(UndefValue::get(II->getType()));
+ II->eraseFromParent();
+ }
}
/// Returns true if this function should be rewritten by this pass. The main
@@ -2438,7 +2462,7 @@ static bool shouldRewriteStatepointsIn(Function &F) {
return false;
}
-void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) {
+void RewriteStatepointsForGC::stripNonValidData(Module &M) {
#ifndef NDEBUG
assert(llvm::any_of(M, shouldRewriteStatepointsIn) && "precondition!");
#endif
@@ -2447,7 +2471,7 @@ void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) {
stripNonValidAttributesFromPrototype(F);
for (Function &F : M)
- stripNonValidAttributesAndMetadataFromBody(F);
+ stripNonValidDataFromBody(F);
}
bool RewriteStatepointsForGC::runOnFunction(Function &F) {
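An illustrative .ll sketch of the new behavior (the RUN line and names are assumptions, not from the patch's tests): because the call may be rewritten into a statepoint that conceptually clobbers the whole heap, the invariant.start is deleted rather than left in place where it could justify sinking the load past the statepoint.

; RUN: opt -S -rewrite-statepoints-for-gc < %s
declare {}* @llvm.invariant.start.p1i8(i64, i8 addrspace(1)* nocapture)
declare void @foo()

define i8 @drop_invariant_start(i8 addrspace(1)* %p) gc "statepoint-example" {
entry:
  %inv = call {}* @llvm.invariant.start.p1i8(i64 1, i8 addrspace(1)* %p)
  call void @foo()                                ; becomes a statepoint
  %v = load i8, i8 addrspace(1)* %p               ; must not be sunk past it
  ret i8 %v
}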
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index b968cb8c892..6de6c8cce2c 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -4133,8 +4133,10 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
"new fragment is outside of original fragment");
Start -= OrigFragment->OffsetInBits;
}
- FragmentExpr =
- DIExpression::createFragmentExpression(Expr, Start, Size);
+ if (auto E = DIExpression::createFragmentExpression(Expr, Start, Size))
+ FragmentExpr = *E;
+ else
+ continue;
}
// Remove any existing intrinsics describing the same alloca.
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index c1034ace206..8a5ae1b8731 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -35,6 +35,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeADCELegacyPassPass(Registry);
initializeBDCELegacyPassPass(Registry);
initializeAlignmentFromAssumptionsPass(Registry);
+ initializeCallSiteSplittingLegacyPassPass(Registry);
initializeConstantHoistingLegacyPassPass(Registry);
initializeConstantPropagationPass(Registry);
initializeCorrelatedValuePropagationPass(Registry);
diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp
index fbb61ac1ae9..2e6fc4e8482 100644
--- a/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -203,6 +203,23 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV,
}
void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
+
+ // Check the summaries to see if the symbol gets resolved to a known local
+ // definition.
+ if (GV.hasName()) {
+ ValueInfo VI = ImportIndex.getValueInfo(GV.getGUID());
+ if (VI) {
+ // Need to check all summaries are local in case of hash collisions.
+ bool IsLocal = VI.getSummaryList().size() &&
+ llvm::all_of(VI.getSummaryList(),
+ [](const std::unique_ptr<GlobalValueSummary> &Summary) {
+ return Summary->isDSOLocal();
+ });
+ if (IsLocal)
+ GV.setDSOLocal(true);
+ }
+ }
+
bool DoPromote = false;
if (GV.hasLocalLinkage() &&
((DoPromote = shouldPromoteLocalToGlobal(&GV)) || isPerformingImport())) {
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 8c643c93ec4..89dbe4b8fda 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -1362,16 +1362,25 @@ void llvm::salvageDebugInfo(Instruction &I) {
SmallVector<DbgValueInst *, 1> DbgValues;
auto &M = *I.getModule();
- auto MDWrap = [&](Value *V) {
+ auto wrapMD = [&](Value *V) {
return MetadataAsValue::get(I.getContext(), ValueAsMetadata::get(V));
};
- if (isa<BitCastInst>(&I)) {
+ auto applyOffset = [&](DbgValueInst *DVI, uint64_t Offset) {
+ auto *DIExpr = DVI->getExpression();
+ DIExpr = DIExpression::prepend(DIExpr, DIExpression::NoDeref, Offset,
+ DIExpression::WithStackValue);
+ DVI->setOperand(0, wrapMD(I.getOperand(0)));
+ DVI->setOperand(2, MetadataAsValue::get(I.getContext(), DIExpr));
+ DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+ };
+
+ if (isa<BitCastInst>(&I) || isa<IntToPtrInst>(&I)) {
findDbgValues(DbgValues, &I);
for (auto *DVI : DbgValues) {
// Bitcasts are entirely irrelevant for debug info. Rewrite the dbg.value
// to use the cast's source.
- DVI->setOperand(0, MDWrap(I.getOperand(0)));
+ DVI->setOperand(0, wrapMD(I.getOperand(0)));
DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
}
} else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
@@ -1383,24 +1392,26 @@ void llvm::salvageDebugInfo(Instruction &I) {
// Rewrite a constant GEP into a DIExpression. Since we are performing
// arithmetic to compute the variable's *value* in the DIExpression, we
// need to mark the expression with a DW_OP_stack_value.
- if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset)) {
- auto *DIExpr = DVI->getExpression();
+ if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset))
// GEP offsets are i32 and thus always fit into an int64_t.
- DIExpr = DIExpression::prepend(DIExpr, DIExpression::NoDeref,
- Offset.getSExtValue(),
- DIExpression::WithStackValue);
- DVI->setOperand(0, MDWrap(I.getOperand(0)));
- DVI->setOperand(2, MetadataAsValue::get(I.getContext(), DIExpr));
- DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
- }
+ applyOffset(DVI, Offset.getSExtValue());
}
+ } else if (auto *BI = dyn_cast<BinaryOperator>(&I)) {
+ if (BI->getOpcode() == Instruction::Add)
+ if (auto *ConstInt = dyn_cast<ConstantInt>(I.getOperand(1)))
+ if (ConstInt->getBitWidth() <= 64) {
+ APInt Offset = ConstInt->getValue();
+ findDbgValues(DbgValues, &I);
+ for (auto *DVI : DbgValues)
+ applyOffset(DVI, Offset.getSExtValue());
+ }
} else if (isa<LoadInst>(&I)) {
findDbgValues(DbgValues, &I);
for (auto *DVI : DbgValues) {
// Rewrite the load into DW_OP_deref.
auto *DIExpr = DVI->getExpression();
DIExpr = DIExpression::prepend(DIExpr, DIExpression::WithDeref);
- DVI->setOperand(0, MDWrap(I.getOperand(0)));
+ DVI->setOperand(0, wrapMD(I.getOperand(0)));
DVI->setOperand(2, MetadataAsValue::get(I.getContext(), DIExpr));
DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
}
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index 13c0bfbcb2e..0de6924e635 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -432,7 +432,7 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,
InstDesc &Prev, bool HasFunNoNaNAttr) {
bool FP = I->getType()->isFloatingPointTy();
Instruction *UAI = Prev.getUnsafeAlgebraInst();
- if (!UAI && FP && !I->hasUnsafeAlgebra())
+ if (!UAI && FP && !I->isFast())
UAI = I; // Found an unsafe (unvectorizable) algebra instruction.
switch (I->getOpcode()) {
@@ -660,11 +660,11 @@ Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder,
break;
}
- // We only match FP sequences with unsafe algebra, so we can unconditionally
+ // We only match FP sequences that are 'fast', so we can unconditionally
// set it on any generated instructions.
IRBuilder<>::FastMathFlagGuard FMFG(Builder);
FastMathFlags FMF;
- FMF.setUnsafeAlgebra();
+ FMF.setFast();
Builder.setFastMathFlags(FMF);
Value *Cmp;
@@ -768,7 +768,7 @@ Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index,
// Floating point operations had to be 'fast' to enable the induction.
FastMathFlags Flags;
- Flags.setUnsafeAlgebra();
+ Flags.setFast();
Value *MulExp = B.CreateFMul(StepValue, Index);
if (isa<Instruction>(MulExp))
@@ -1338,7 +1338,7 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
static Value *addFastMathFlag(Value *V) {
if (isa<FPMathOperator>(V)) {
FastMathFlags Flags;
- Flags.setUnsafeAlgebra();
+ Flags.setFast();
cast<Instruction>(V)->setFastMathFlags(Flags);
}
return V;
@@ -1401,7 +1401,7 @@ Value *llvm::createSimpleTargetReduction(
RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid;
// TODO: Support creating ordered reductions.
FastMathFlags FMFUnsafe;
- FMFUnsafe.setUnsafeAlgebra();
+ FMFUnsafe.setFast();
switch (Opcode) {
case Instruction::Add:
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 3c4dae92ebf..e0045e9f48a 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2901,7 +2901,9 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB,
else
return false;
}
- return N <= PHINodeFoldingThreshold;
+ // The store we want to merge is counted in N, so add 1 to make sure
+ // we're counting the instructions that would be left.
+ return N <= (PHINodeFoldingThreshold + 1);
};
if (!MergeCondStoresAggressively &&
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 33117659489..a29b83717f3 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1111,7 +1111,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
// Example: x = 1000, y = 0.001.
// pow(exp(x), y) = pow(inf, 0.001) = inf, whereas exp(x*y) = exp(1).
auto *OpC = dyn_cast<CallInst>(Op1);
- if (OpC && OpC->hasUnsafeAlgebra() && CI->hasUnsafeAlgebra()) {
+ if (OpC && OpC->isFast() && CI->isFast()) {
LibFunc Func;
Function *OpCCallee = OpC->getCalledFunction();
if (OpCCallee && TLI->getLibFunc(OpCCallee->getName(), Func) &&
@@ -1136,7 +1136,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
LibFunc_sqrtl)) {
// If -ffast-math:
// pow(x, -0.5) -> 1.0 / sqrt(x)
- if (CI->hasUnsafeAlgebra()) {
+ if (CI->isFast()) {
IRBuilder<>::FastMathFlagGuard Guard(B);
B.setFastMathFlags(CI->getFastMathFlags());
@@ -1157,7 +1157,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
LibFunc_sqrtl)) {
// In -ffast-math, pow(x, 0.5) -> sqrt(x).
- if (CI->hasUnsafeAlgebra()) {
+ if (CI->isFast()) {
IRBuilder<>::FastMathFlagGuard Guard(B);
B.setFastMathFlags(CI->getFastMathFlags());
@@ -1196,7 +1196,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip");
// In -ffast-math, generate repeated fmul instead of generating pow(x, n).
- if (CI->hasUnsafeAlgebra()) {
+ if (CI->isFast()) {
APFloat V = abs(Op2C->getValueAPF());
// We limit to a max of 7 fmul(s). Thus max exponent is 32.
// This transformation applies to integer exponents only.
@@ -1284,9 +1284,9 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) {
IRBuilder<>::FastMathFlagGuard Guard(B);
FastMathFlags FMF;
- if (CI->hasUnsafeAlgebra()) {
- // Unsafe algebra sets all fast-math-flags to true.
- FMF.setUnsafeAlgebra();
+ if (CI->isFast()) {
+ // If the call is 'fast', then anything we create here will also be 'fast'.
+ FMF.setFast();
} else {
// At a minimum, no-nans-fp-math must be true.
if (!CI->hasNoNaNs())
@@ -1317,13 +1317,13 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {
if (UnsafeFPShrink && hasFloatVersion(Name))
Ret = optimizeUnaryDoubleFP(CI, B, true);
- if (!CI->hasUnsafeAlgebra())
+ if (!CI->isFast())
return Ret;
Value *Op1 = CI->getArgOperand(0);
auto *OpC = dyn_cast<CallInst>(Op1);
- // The earlier call must also be unsafe in order to do these transforms.
- if (!OpC || !OpC->hasUnsafeAlgebra())
+ // The earlier call must also be 'fast' in order to do these transforms.
+ if (!OpC || !OpC->isFast())
return Ret;
// log(pow(x,y)) -> y*log(x)
@@ -1333,7 +1333,7 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {
IRBuilder<>::FastMathFlagGuard Guard(B);
FastMathFlags FMF;
- FMF.setUnsafeAlgebra();
+ FMF.setFast();
B.setFastMathFlags(FMF);
LibFunc Func;
@@ -1365,11 +1365,11 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {
Callee->getIntrinsicID() == Intrinsic::sqrt))
Ret = optimizeUnaryDoubleFP(CI, B, true);
- if (!CI->hasUnsafeAlgebra())
+ if (!CI->isFast())
return Ret;
Instruction *I = dyn_cast<Instruction>(CI->getArgOperand(0));
- if (!I || I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra())
+ if (!I || I->getOpcode() != Instruction::FMul || !I->isFast())
return Ret;
// We're looking for a repeated factor in a multiplication tree,
@@ -1391,8 +1391,7 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {
Value *OtherMul0, *OtherMul1;
if (match(Op0, m_FMul(m_Value(OtherMul0), m_Value(OtherMul1)))) {
// Pattern: sqrt((x * y) * z)
- if (OtherMul0 == OtherMul1 &&
- cast<Instruction>(Op0)->hasUnsafeAlgebra()) {
+ if (OtherMul0 == OtherMul1 && cast<Instruction>(Op0)->isFast()) {
// Matched: sqrt((x * x) * z)
RepeatOp = OtherMul0;
OtherOp = Op1;
@@ -1437,8 +1436,8 @@ Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilder<> &B) {
if (!OpC)
return Ret;
- // Both calls must allow unsafe optimizations in order to remove them.
- if (!CI->hasUnsafeAlgebra() || !OpC->hasUnsafeAlgebra())
+ // Both calls must be 'fast' in order to remove them.
+ if (!CI->isFast() || !OpC->isFast())
return Ret;
// tan(atan(x)) -> x
@@ -2167,10 +2166,10 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
// Command-line parameter overrides instruction attribute.
// This can't be moved to optimizeFloatingPointLibCall() because it may be
- // used by the intrinsic optimizations.
+ // used by the intrinsic optimizations.
if (EnableUnsafeFPShrink.getNumOccurrences() > 0)
UnsafeFPShrink = EnableUnsafeFPShrink;
- else if (isa<FPMathOperator>(CI) && CI->hasUnsafeAlgebra())
+ else if (isa<FPMathOperator>(CI) && CI->isFast())
UnsafeFPShrink = true;
// First, check for intrinsics.
diff --git a/lib/Transforms/Utils/SplitModule.cpp b/lib/Transforms/Utils/SplitModule.cpp
index 07157069518..934a1bd73c2 100644
--- a/lib/Transforms/Utils/SplitModule.cpp
+++ b/lib/Transforms/Utils/SplitModule.cpp
@@ -141,15 +141,15 @@ static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap,
}
if (GV.hasLocalLinkage())
- addAllGlobalValueUsers(GVtoClusterMap, &GV, &GV);
- };
-
- std::for_each(M->begin(), M->end(), recordGVSet);
- std::for_each(M->global_begin(), M->global_end(), recordGVSet);
- std::for_each(M->alias_begin(), M->alias_end(), recordGVSet);
-
- // Assigned all GVs to merged clusters while balancing number of objects in
- // each.
+ addAllGlobalValueUsers(GVtoClusterMap, &GV, &GV);
+ };
+
+ llvm::for_each(M->functions(), recordGVSet);
+ llvm::for_each(M->globals(), recordGVSet);
+ llvm::for_each(M->aliases(), recordGVSet);
+
+  // Assign all GVs to merged clusters while balancing the number of objects
+  // in each.
auto CompareClusters = [](const std::pair<unsigned, unsigned> &a,
const std::pair<unsigned, unsigned> &b) {
if (a.second || b.second)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ca2f5a178e0..ed29ca0b573 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -385,7 +385,7 @@ static unsigned getReciprocalPredBlockProb() { return 2; }
static Value *addFastMathFlag(Value *V) {
if (isa<FPMathOperator>(V)) {
FastMathFlags Flags;
- Flags.setUnsafeAlgebra();
+ Flags.setFast();
cast<Instruction>(V)->setFastMathFlags(Flags);
}
return V;
@@ -2720,7 +2720,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
// Floating point operations had to be 'fast' to enable the induction.
FastMathFlags Flags;
- Flags.setUnsafeAlgebra();
+ Flags.setFast();
Value *MulOp = Builder.CreateFMul(Cv, Step);
if (isa<Instruction>(MulOp))
@@ -5396,7 +5396,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// operations, shuffles, or casts, as they don't change precision or
// semantics.
} else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
- !I.hasUnsafeAlgebra()) {
+ !I.isFast()) {
DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
Hints->setPotentiallyUnsafe();
}
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5dcf5528ac9..4232252af36 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4880,7 +4880,7 @@ class HorizontalReduction {
case RK_Min:
case RK_Max:
return Opcode == Instruction::ICmp ||
- cast<Instruction>(I->getOperand(0))->hasUnsafeAlgebra();
+ cast<Instruction>(I->getOperand(0))->isFast();
case RK_UMin:
case RK_UMax:
assert(Opcode == Instruction::ICmp &&
@@ -5232,7 +5232,7 @@ public:
Value *VectorizedTree = nullptr;
IRBuilder<> Builder(ReductionRoot);
FastMathFlags Unsafe;
- Unsafe.setUnsafeAlgebra();
+ Unsafe.setFast();
Builder.setFastMathFlags(Unsafe);
unsigned i = 0;
diff --git a/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll b/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll
new file mode 100644
index 00000000000..0a580276d95
--- /dev/null
+++ b/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll
@@ -0,0 +1,208 @@
+; RUN: opt < %s -analyze -block-freq | FileCheck %s
+; RUN: opt < %s -passes='print<block-freq>' -disable-output 2>&1 | FileCheck %s
+
+; Function Attrs: noinline norecurse nounwind readnone uwtable
+define i32 @_Z11irreducibleii(i32 %iter_outer, i32 %iter_inner) local_unnamed_addr !prof !27 {
+entry:
+ %cmp24 = icmp sgt i32 %iter_outer, 0
+ br i1 %cmp24, label %for.body, label %entry.for.cond.cleanup_crit_edge, !prof !28
+
+entry.for.cond.cleanup_crit_edge: ; preds = %entry
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.end, %entry.for.cond.cleanup_crit_edge
+ %sum.0.lcssa = phi i32 [ 0, %entry.for.cond.cleanup_crit_edge ], [ %sum.1, %for.end ]
+ ret i32 %sum.0.lcssa
+
+for.body: ; preds = %for.end, %entry
+ %k.026 = phi i32 [ %inc12, %for.end ], [ 0, %entry ]
+ %sum.025 = phi i32 [ %sum.1, %for.end ], [ 0, %entry ]
+ %rem23 = and i32 %k.026, 1
+ %cmp1 = icmp eq i32 %rem23, 0
+ br i1 %cmp1, label %entry8, label %for.cond2, !prof !29
+
+for.cond2: ; preds = %if.end9, %for.body
+ %sum.1 = phi i32 [ %add10, %if.end9 ], [ %sum.025, %for.body ]
+ %i.0 = phi i32 [ %inc, %if.end9 ], [ 0, %for.body ]
+ %cmp3 = icmp slt i32 %i.0, %iter_inner
+ br i1 %cmp3, label %for.body4, label %for.end, !prof !30, !irr_loop !31
+
+for.body4: ; preds = %for.cond2
+ %rem5 = srem i32 %k.026, 3
+ %cmp6 = icmp eq i32 %rem5, 0
+ br i1 %cmp6, label %entry8, label %if.end9, !prof !32
+
+entry8: ; preds = %for.body4, %for.body
+ %sum.2 = phi i32 [ %sum.025, %for.body ], [ %sum.1, %for.body4 ]
+ %i.1 = phi i32 [ 0, %for.body ], [ %i.0, %for.body4 ]
+ %add = add nsw i32 %sum.2, 4
+ br label %if.end9, !irr_loop !33
+
+if.end9: ; preds = %entry8, %for.body4
+ %sum.3 = phi i32 [ %add, %entry8 ], [ %sum.1, %for.body4 ]
+ %i.2 = phi i32 [ %i.1, %entry8 ], [ %i.0, %for.body4 ]
+ %add10 = add nsw i32 %sum.3, 1
+ %inc = add nsw i32 %i.2, 1
+ br label %for.cond2, !irr_loop !34
+
+for.end: ; preds = %for.cond2
+ %inc12 = add nuw nsw i32 %k.026, 1
+ %exitcond = icmp eq i32 %inc12, %iter_outer
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !35
+}
+
+!27 = !{!"function_entry_count", i64 1}
+!28 = !{!"branch_weights", i32 1, i32 0}
+!29 = !{!"branch_weights", i32 50, i32 50}
+!30 = !{!"branch_weights", i32 950, i32 100}
+!31 = !{!"loop_header_weight", i64 1050}
+!32 = !{!"branch_weights", i32 323, i32 627}
+!33 = !{!"loop_header_weight", i64 373}
+!34 = !{!"loop_header_weight", i64 1000}
+!35 = !{!"branch_weights", i32 1, i32 99}
+
+; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreducibleii':
+; CHECK-NEXT: block-frequency-info: _Z11irreducibleii
+; CHECK-NEXT: - entry: {{.*}} count = 1
+; CHECK-NEXT: - entry.for.cond.cleanup_crit_edge: {{.*}} count = 0
+; CHECK-NEXT: - for.cond.cleanup: {{.*}} count = 1
+; CHECK-NEXT: - for.body: {{.*}} count = 100
+; CHECK-NEXT: - for.cond2: {{.*}} count = 1050, irr_loop_header_weight = 1050
+; CHECK-NEXT: - for.body4: {{.*}} count = 950
+; CHECK-NEXT: - entry8: {{.*}} count = 373, irr_loop_header_weight = 373
+; CHECK-NEXT: - if.end9: {{.*}} count = 1000, irr_loop_header_weight = 1000
+; CHECK-NEXT: - for.end: {{.*}} count = 100
+
+@targets = local_unnamed_addr global [256 x i8*] zeroinitializer, align 16
+@tracing = local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: noinline norecurse nounwind uwtable
+define i32 @_Z11irreduciblePh(i8* nocapture readonly %p) !prof !27 {
+entry:
+ store <2 x i8*> <i8* blockaddress(@_Z11irreduciblePh, %sw.bb), i8* blockaddress(@_Z11irreduciblePh, %TARGET_1)>, <2 x i8*>* bitcast ([256 x i8*]* @targets to <2 x i8*>*), align 16
+ store i8* blockaddress(@_Z11irreduciblePh, %TARGET_2), i8** getelementptr inbounds ([256 x i8*], [256 x i8*]* @targets, i64 0, i64 2), align 16
+ %0 = load i32, i32* @tracing, align 4
+ %tobool = icmp eq i32 %0, 0
+ br label %for.cond1
+
+for.cond1: ; preds = %sw.default, %entry
+ %p.addr.0 = phi i8* [ %p, %entry ], [ %p.addr.4, %sw.default ]
+ %sum.0 = phi i32 [ 0, %entry ], [ %add25, %sw.default ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i64 1
+ %1 = load i8, i8* %p.addr.0, align 1
+ %incdec.ptr2 = getelementptr inbounds i8, i8* %p.addr.0, i64 2
+ %2 = load i8, i8* %incdec.ptr, align 1
+ %conv3 = zext i8 %2 to i32
+ br label %dispatch_op
+
+dispatch_op: ; preds = %sw.bb6, %for.cond1
+ %p.addr.1 = phi i8* [ %incdec.ptr2, %for.cond1 ], [ %p.addr.2, %sw.bb6 ]
+ %op.0 = phi i8 [ %1, %for.cond1 ], [ 1, %sw.bb6 ]
+ %oparg.0 = phi i32 [ %conv3, %for.cond1 ], [ %oparg.2, %sw.bb6 ]
+ %sum.1 = phi i32 [ %sum.0, %for.cond1 ], [ %add7, %sw.bb6 ]
+ switch i8 %op.0, label %sw.default [
+ i8 0, label %sw.bb
+ i8 1, label %dispatch_op.sw.bb6_crit_edge
+ i8 2, label %sw.bb15
+ ], !prof !36
+
+dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op
+ br label %sw.bb6
+
+sw.bb: ; preds = %indirectgoto, %dispatch_op
+ %oparg.1 = phi i32 [ %oparg.0, %dispatch_op ], [ 0, %indirectgoto ]
+ %sum.2 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %indirectgoto ]
+ %add.neg = sub i32 -5, %oparg.1
+ %sub = add i32 %add.neg, %sum.2
+ br label %exit
+
+TARGET_1: ; preds = %indirectgoto
+ %incdec.ptr4 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
+ %3 = load i8, i8* %p.addr.5, align 1
+ %conv5 = zext i8 %3 to i32
+ br label %sw.bb6
+
+sw.bb6: ; preds = %TARGET_1, %dispatch_op.sw.bb6_crit_edge
+ %p.addr.2 = phi i8* [ %incdec.ptr4, %TARGET_1 ], [ %p.addr.1, %dispatch_op.sw.bb6_crit_edge ]
+ %oparg.2 = phi i32 [ %conv5, %TARGET_1 ], [ %oparg.0, %dispatch_op.sw.bb6_crit_edge ]
+ %sum.3 = phi i32 [ %sum.7, %TARGET_1 ], [ %sum.1, %dispatch_op.sw.bb6_crit_edge ]
+ %mul = mul nsw i32 %oparg.2, 7
+ %add7 = add nsw i32 %sum.3, %mul
+ %rem46 = and i32 %add7, 1
+ %cmp8 = icmp eq i32 %rem46, 0
+ br i1 %cmp8, label %dispatch_op, label %if.then, !prof !37, !irr_loop !38
+
+if.then: ; preds = %sw.bb6
+ %mul9 = mul nsw i32 %add7, 9
+ br label %indirectgoto
+
+TARGET_2: ; preds = %indirectgoto
+ %incdec.ptr13 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
+ %4 = load i8, i8* %p.addr.5, align 1
+ %conv14 = zext i8 %4 to i32
+ br label %sw.bb15
+
+sw.bb15: ; preds = %TARGET_2, %dispatch_op
+ %p.addr.3 = phi i8* [ %p.addr.1, %dispatch_op ], [ %incdec.ptr13, %TARGET_2 ]
+ %oparg.3 = phi i32 [ %oparg.0, %dispatch_op ], [ %conv14, %TARGET_2 ]
+ %sum.4 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %TARGET_2 ]
+ %add16 = add nsw i32 %oparg.3, 3
+ %add17 = add nsw i32 %add16, %sum.4
+ br i1 %tobool, label %if.then18, label %exit, !prof !39, !irr_loop !40
+
+if.then18: ; preds = %sw.bb15
+ %idx.ext = sext i32 %oparg.3 to i64
+ %add.ptr = getelementptr inbounds i8, i8* %p.addr.3, i64 %idx.ext
+ %mul19 = mul nsw i32 %add17, 17
+ br label %indirectgoto
+
+unknown_op: ; preds = %indirectgoto
+ %sub24 = add nsw i32 %sum.7, -4
+ br label %sw.default
+
+sw.default: ; preds = %unknown_op, %dispatch_op
+ %p.addr.4 = phi i8* [ %p.addr.5, %unknown_op ], [ %p.addr.1, %dispatch_op ]
+ %sum.5 = phi i32 [ %sub24, %unknown_op ], [ %sum.1, %dispatch_op ]
+ %add25 = add nsw i32 %sum.5, 11
+ br label %for.cond1
+
+exit: ; preds = %sw.bb15, %sw.bb
+ %sum.6 = phi i32 [ %sub, %sw.bb ], [ %add17, %sw.bb15 ]
+ ret i32 %sum.6
+
+indirectgoto: ; preds = %if.then18, %if.then
+ %add.ptr.pn = phi i8* [ %add.ptr, %if.then18 ], [ %p.addr.2, %if.then ]
+ %sum.7 = phi i32 [ %mul19, %if.then18 ], [ %mul9, %if.then ]
+ %p.addr.5 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 1
+ %5 = load i8, i8* %add.ptr.pn, align 1
+ %idxprom21 = zext i8 %5 to i64
+ %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21
+ %6 = load i8*, i8** %arrayidx22, align 8
+ indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !41, !irr_loop !42
+}
+
+!36 = !{!"branch_weights", i32 0, i32 0, i32 201, i32 1}
+!37 = !{!"branch_weights", i32 201, i32 300}
+!38 = !{!"loop_header_weight", i64 501}
+!39 = !{!"branch_weights", i32 100, i32 0}
+!40 = !{!"loop_header_weight", i64 100}
+!41 = !{!"branch_weights", i32 0, i32 1, i32 300, i32 99}
+!42 = !{!"loop_header_weight", i64 400}
+
+; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreduciblePh':
+; CHECK-NEXT: block-frequency-info: _Z11irreduciblePh
+; CHECK-NEXT: - entry: {{.*}} count = 1
+; CHECK-NEXT: - for.cond1: {{.*}} count = 1
+; CHECK-NEXT: - dispatch_op: {{.*}} count = 201
+; CHECK-NEXT: - dispatch_op.sw.bb6_crit_edge: {{.*}} count = 200
+; CHECK-NEXT: - sw.bb: {{.*}} count = 0
+; CHECK-NEXT: - TARGET_1: {{.*}} count = 299
+; CHECK-NEXT: - sw.bb6: {{.*}} count = 500, irr_loop_header_weight = 501
+; CHECK-NEXT: - if.then: {{.*}} count = 299
+; CHECK-NEXT: - TARGET_2: {{.*}} count = 98
+; CHECK-NEXT: - sw.bb15: {{.*}} count = 99, irr_loop_header_weight = 100
+; CHECK-NEXT: - if.then18: {{.*}} count = 99
+; CHECK-NEXT: - unknown_op: {{.*}} count = 0
+; CHECK-NEXT: - sw.default: {{.*}} count = 0
+; CHECK-NEXT: - exit: {{.*}} count = 1
+; CHECK-NEXT: - indirectgoto: {{.*}} count = 399, irr_loop_header_weight = 400
diff --git a/test/Analysis/CostModel/X86/interleaved-load-float.ll b/test/Analysis/CostModel/X86/interleaved-load-float.ll
new file mode 100644
index 00000000000..373a55d7ad4
--- /dev/null
+++ b/test/Analysis/CostModel/X86/interleaved-load-float.ll
@@ -0,0 +1,141 @@
+; REQUIRES: asserts
+; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake %s 2>&1 | FileCheck %s
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+@src = common local_unnamed_addr global [120 x float] zeroinitializer, align 4
+@dst = common local_unnamed_addr global [120 x float] zeroinitializer, align 4
+
+; Function Attrs: norecurse nounwind
+define void @stride8(float %k, i32 %width_) {
+entry:
+
+; CHECK: Found an estimated cost of 48 for VF 8 For instruction: %0 = load float
+
+ %cmp72 = icmp sgt i32 %width_, 0
+ br i1 %cmp72, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %i.073 = phi i32 [ 0, %for.body.lr.ph ], [ %add46, %for.body ]
+ %arrayidx = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %i.073
+ %0 = load float, float* %arrayidx, align 4
+ %mul = fmul fast float %0, %k
+ %arrayidx2 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %i.073
+ %1 = load float, float* %arrayidx2, align 4
+ %add3 = fadd fast float %1, %mul
+ store float %add3, float* %arrayidx2, align 4
+ %add4 = or i32 %i.073, 1
+ %arrayidx5 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add4
+ %2 = load float, float* %arrayidx5, align 4
+ %mul6 = fmul fast float %2, %k
+ %arrayidx8 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add4
+ %3 = load float, float* %arrayidx8, align 4
+ %add9 = fadd fast float %3, %mul6
+ store float %add9, float* %arrayidx8, align 4
+ %add10 = or i32 %i.073, 2
+ %arrayidx11 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add10
+ %4 = load float, float* %arrayidx11, align 4
+ %mul12 = fmul fast float %4, %k
+ %arrayidx14 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add10
+ %5 = load float, float* %arrayidx14, align 4
+ %add15 = fadd fast float %5, %mul12
+ store float %add15, float* %arrayidx14, align 4
+ %add16 = or i32 %i.073, 3
+ %arrayidx17 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add16
+ %6 = load float, float* %arrayidx17, align 4
+ %mul18 = fmul fast float %6, %k
+ %arrayidx20 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add16
+ %7 = load float, float* %arrayidx20, align 4
+ %add21 = fadd fast float %7, %mul18
+ store float %add21, float* %arrayidx20, align 4
+ %add22 = or i32 %i.073, 4
+ %arrayidx23 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add22
+ %8 = load float, float* %arrayidx23, align 4
+ %mul24 = fmul fast float %8, %k
+ %arrayidx26 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add22
+ %9 = load float, float* %arrayidx26, align 4
+ %add27 = fadd fast float %9, %mul24
+ store float %add27, float* %arrayidx26, align 4
+ %add28 = or i32 %i.073, 5
+ %arrayidx29 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add28
+ %10 = load float, float* %arrayidx29, align 4
+ %mul30 = fmul fast float %10, %k
+ %arrayidx32 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add28
+ %11 = load float, float* %arrayidx32, align 4
+ %add33 = fadd fast float %11, %mul30
+ store float %add33, float* %arrayidx32, align 4
+ %add34 = or i32 %i.073, 6
+ %arrayidx35 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add34
+ %12 = load float, float* %arrayidx35, align 4
+ %mul36 = fmul fast float %12, %k
+ %arrayidx38 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add34
+ %13 = load float, float* %arrayidx38, align 4
+ %add39 = fadd fast float %13, %mul36
+ store float %add39, float* %arrayidx38, align 4
+ %add40 = or i32 %i.073, 7
+ %arrayidx41 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add40
+ %14 = load float, float* %arrayidx41, align 4
+ %mul42 = fmul fast float %14, %k
+ %arrayidx44 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add40
+ %15 = load float, float* %arrayidx44, align 4
+ %add45 = fadd fast float %15, %mul42
+ store float %add45, float* %arrayidx44, align 4
+ %add46 = add nuw nsw i32 %i.073, 8
+ %cmp = icmp slt i32 %add46, %width_
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+; Function Attrs: norecurse nounwind
+define void @stride3(float %k, i32 %width_) {
+entry:
+
+; CHECK: Found an estimated cost of 20 for VF 8 For instruction: %0 = load float
+
+ %cmp27 = icmp sgt i32 %width_, 0
+ br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %add16, %for.body ]
+ %arrayidx = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %i.028
+ %0 = load float, float* %arrayidx, align 4
+ %mul = fmul fast float %0, %k
+ %arrayidx2 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %i.028
+ %1 = load float, float* %arrayidx2, align 4
+ %add3 = fadd fast float %1, %mul
+ store float %add3, float* %arrayidx2, align 4
+ %add4 = add nuw nsw i32 %i.028, 1
+ %arrayidx5 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add4
+ %2 = load float, float* %arrayidx5, align 4
+ %mul6 = fmul fast float %2, %k
+ %arrayidx8 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add4
+ %3 = load float, float* %arrayidx8, align 4
+ %add9 = fadd fast float %3, %mul6
+ store float %add9, float* %arrayidx8, align 4
+ %add10 = add nuw nsw i32 %i.028, 2
+ %arrayidx11 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add10
+ %4 = load float, float* %arrayidx11, align 4
+ %mul12 = fmul fast float %4, %k
+ %arrayidx14 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add10
+ %5 = load float, float* %arrayidx14, align 4
+ %add15 = fadd fast float %5, %mul12
+ store float %add15, float* %arrayidx14, align 4
+ %add16 = add nuw nsw i32 %i.028, 3
+ %cmp = icmp slt i32 %add16, %width_
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
diff --git a/test/Assembler/fast-math-flags.ll b/test/Assembler/fast-math-flags.ll
index 4ef3607e1d0..664b1bd271e 100644
--- a/test/Assembler/fast-math-flags.ll
+++ b/test/Assembler/fast-math-flags.ll
@@ -7,6 +7,8 @@
@vec = external global <3 x float>
@arr = external global [3 x float]
+declare float @foo(float)
+
define float @none(float %x, float %y) {
entry:
; CHECK: %vec = load <3 x float>, <3 x float>* @vec
@@ -86,6 +88,28 @@ entry:
ret float %c
}
+; CHECK: @reassoc(
+define float @reassoc(float %x, float %y) {
+; CHECK: %a = fsub reassoc float %x, %y
+ %a = fsub reassoc float %x, %y
+; CHECK: %b = fmul reassoc float %x, %y
+ %b = fmul reassoc float %x, %y
+; CHECK: %c = call reassoc float @foo(float %b)
+ %c = call reassoc float @foo(float %b)
+ ret float %c
+}
+
+; CHECK: @afn(
+define float @afn(float %x, float %y) {
+; CHECK: %a = fdiv afn float %x, %y
+ %a = fdiv afn float %x, %y
+; CHECK: %b = frem afn float %x, %y
+ %b = frem afn float %x, %y
+; CHECK: %c = call afn float @foo(float %b)
+ %c = call afn float @foo(float %b)
+ ret float %c
+}
+
; CHECK: no_nan_inf
define float @no_nan_inf(float %x, float %y) {
entry:
@@ -130,10 +154,10 @@ entry:
; CHECK: %arr = load [3 x float], [3 x float]* @arr
%arr = load [3 x float], [3 x float]* @arr
-; CHECK: %a = fadd nnan ninf float %x, %y
- %a = fadd ninf nnan float %x, %y
-; CHECK: %a_vec = fadd nnan <3 x float> %vec, %vec
- %a_vec = fadd nnan <3 x float> %vec, %vec
+; CHECK: %a = fadd nnan ninf afn float %x, %y
+ %a = fadd ninf nnan afn float %x, %y
+; CHECK: %a_vec = fadd reassoc nnan <3 x float> %vec, %vec
+ %a_vec = fadd reassoc nnan <3 x float> %vec, %vec
; CHECK: %b = fsub fast float %x, %y
%b = fsub nnan nsz fast float %x, %y
; CHECK: %b_vec = fsub nnan <3 x float> %vec, %vec
diff --git a/test/Bitcode/compatibility-3.6.ll b/test/Bitcode/compatibility-3.6.ll
index e9313dfba87..6c47a853e24 100644
--- a/test/Bitcode/compatibility-3.6.ll
+++ b/test/Bitcode/compatibility-3.6.ll
@@ -612,7 +612,9 @@ define void @fastmathflags(float %op1, float %op2) {
%f.arcp = fadd arcp float %op1, %op2
; CHECK: %f.arcp = fadd arcp float %op1, %op2
%f.fast = fadd fast float %op1, %op2
- ; CHECK: %f.fast = fadd fast float %op1, %op2
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'.
+ ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2
ret void
}
diff --git a/test/Bitcode/compatibility-3.7.ll b/test/Bitcode/compatibility-3.7.ll
index 82fc9905535..55844e5c498 100644
--- a/test/Bitcode/compatibility-3.7.ll
+++ b/test/Bitcode/compatibility-3.7.ll
@@ -656,7 +656,9 @@ define void @fastmathflags(float %op1, float %op2) {
%f.arcp = fadd arcp float %op1, %op2
; CHECK: %f.arcp = fadd arcp float %op1, %op2
%f.fast = fadd fast float %op1, %op2
- ; CHECK: %f.fast = fadd fast float %op1, %op2
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'.
+ ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2
ret void
}
diff --git a/test/Bitcode/compatibility-3.8.ll b/test/Bitcode/compatibility-3.8.ll
index 2e70a380d10..a7fa20f2bc0 100644
--- a/test/Bitcode/compatibility-3.8.ll
+++ b/test/Bitcode/compatibility-3.8.ll
@@ -687,7 +687,9 @@ define void @fastmathflags(float %op1, float %op2) {
%f.arcp = fadd arcp float %op1, %op2
; CHECK: %f.arcp = fadd arcp float %op1, %op2
%f.fast = fadd fast float %op1, %op2
- ; CHECK: %f.fast = fadd fast float %op1, %op2
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'.
+ ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2
ret void
}
@@ -700,7 +702,9 @@ declare <4 x double> @fmf3()
; CHECK-LABEL: fastMathFlagsForCalls(
define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) {
%call.fast = call fast float @fmf1()
- ; CHECK: %call.fast = call fast float @fmf1()
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'.
+ ; CHECK: %call.fast = call reassoc nnan ninf nsz arcp float @fmf1()
; Throw in some other attributes to make sure those stay in the right places.
diff --git a/test/Bitcode/compatibility-3.9.ll b/test/Bitcode/compatibility-3.9.ll
index 7c84daa7d3c..c456fefe9d4 100644
--- a/test/Bitcode/compatibility-3.9.ll
+++ b/test/Bitcode/compatibility-3.9.ll
@@ -758,7 +758,9 @@ define void @fastmathflags(float %op1, float %op2) {
%f.arcp = fadd arcp float %op1, %op2
; CHECK: %f.arcp = fadd arcp float %op1, %op2
%f.fast = fadd fast float %op1, %op2
- ; CHECK: %f.fast = fadd fast float %op1, %op2
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'.
+ ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2
ret void
}
@@ -771,7 +773,9 @@ declare <4 x double> @fmf3()
; CHECK-LABEL: fastMathFlagsForCalls(
define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) {
%call.fast = call fast float @fmf1()
- ; CHECK: %call.fast = call fast float @fmf1()
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'.
+ ; CHECK: %call.fast = call reassoc nnan ninf nsz arcp float @fmf1()
; Throw in some other attributes to make sure those stay in the right places.
diff --git a/test/Bitcode/compatibility-4.0.ll b/test/Bitcode/compatibility-4.0.ll
index 9e34d48c95f..68446a7d5b0 100644
--- a/test/Bitcode/compatibility-4.0.ll
+++ b/test/Bitcode/compatibility-4.0.ll
@@ -757,8 +757,10 @@ define void @fastmathflags(float %op1, float %op2) {
; CHECK: %f.nsz = fadd nsz float %op1, %op2
%f.arcp = fadd arcp float %op1, %op2
; CHECK: %f.arcp = fadd arcp float %op1, %op2
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'.
%f.fast = fadd fast float %op1, %op2
- ; CHECK: %f.fast = fadd fast float %op1, %op2
+ ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2
ret void
}
@@ -771,7 +773,9 @@ declare <4 x double> @fmf3()
; CHECK-LABEL: fastMathFlagsForCalls(
define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) {
%call.fast = call fast float @fmf1()
- ; CHECK: %call.fast = call fast float @fmf1()
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'.
+ ; CHECK: %call.fast = call reassoc nnan ninf nsz arcp float @fmf1()
; Throw in some other attributes to make sure those stay in the right places.
diff --git a/test/Bitcode/compatibility-5.0.ll b/test/Bitcode/compatibility-5.0.ll
index a4b3fca82b7..cdadc032d87 100644
--- a/test/Bitcode/compatibility-5.0.ll
+++ b/test/Bitcode/compatibility-5.0.ll
@@ -765,7 +765,9 @@ define void @fastmathflags(float %op1, float %op2) {
%f.contract = fadd contract float %op1, %op2
; CHECK: %f.contract = fadd contract float %op1, %op2
%f.fast = fadd fast float %op1, %op2
- ; CHECK: %f.fast = fadd fast float %op1, %op2
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'afn' bit set, so this is not fully 'fast'.
+ ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp contract float %op1, %op2
ret void
}
@@ -778,7 +780,9 @@ declare <4 x double> @fmf3()
; CHECK-LABEL: fastMathFlagsForCalls(
define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) {
%call.fast = call fast float @fmf1()
- ; CHECK: %call.fast = call fast float @fmf1()
+ ; 'fast' used to be its own bit, but this changed in Oct 2017.
+ ; The binary test file does not have the newer 'afn' bit set, so this is not fully 'fast'.
+ ; CHECK: %call.fast = call reassoc nnan ninf nsz arcp contract float @fmf1()
; Throw in some other attributes to make sure those stay in the right places.
diff --git a/test/Bitcode/compatibility.ll b/test/Bitcode/compatibility.ll
index 7d4167f4cb0..0157fd438a7 100644
--- a/test/Bitcode/compatibility.ll
+++ b/test/Bitcode/compatibility.ll
@@ -775,6 +775,10 @@ define void @fastmathflags(float %op1, float %op2) {
; CHECK: %f.arcp = fadd arcp float %op1, %op2
%f.contract = fadd contract float %op1, %op2
; CHECK: %f.contract = fadd contract float %op1, %op2
+ %f.afn = fadd afn float %op1, %op2
+ ; CHECK: %f.afn = fadd afn float %op1, %op2
+ %f.reassoc = fadd reassoc float %op1, %op2
+ ; CHECK: %f.reassoc = fadd reassoc float %op1, %op2
%f.fast = fadd fast float %op1, %op2
; CHECK: %f.fast = fadd fast float %op1, %op2
ret void
diff --git a/test/Bitcode/thinlto-summary-local-5.0.ll b/test/Bitcode/thinlto-summary-local-5.0.ll
new file mode 100644
index 00000000000..cbc48d23df3
--- /dev/null
+++ b/test/Bitcode/thinlto-summary-local-5.0.ll
@@ -0,0 +1,22 @@
+; Bitcode compatibility test for dso_local flag in thin-lto summaries.
+; Checks that older bitcode summaries without the dso_local op are still
+; properly parsed and don't set GlobalValues as dso_local.
+
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; RUN: llvm-bcanalyzer -dump %s.bc | FileCheck %s --check-prefix=BCAN
+
+define void @foo() {
+;CHECK-DAG:define void @foo()
+ ret void
+}
+
+@bar = global i32 0
+;CHECK-DAG: @bar = global i32 0
+
+@baz = alias i32, i32* @bar
+;CHECK-DAG: @baz = alias i32, i32* @bar
+
+;BCAN: <SOURCE_FILENAME
+;BCAN-NEXT: <GLOBALVAR {{.*}} op7=0/>
+;BCAN-NEXT: <FUNCTION {{.*}} op16=0/>
+;BCAN-NEXT: <ALIAS {{.*}} op9=0/>
diff --git a/test/Bitcode/thinlto-summary-local-5.0.ll.bc b/test/Bitcode/thinlto-summary-local-5.0.ll.bc
new file mode 100644
index 00000000000..8dc7ca0a74b
--- /dev/null
+++ b/test/Bitcode/thinlto-summary-local-5.0.ll.bc
Binary files differ
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
index 25c0e78a7b2..4a4c3c58072 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
@@ -167,3 +167,70 @@ end:
%vec = load <2 x i16*>, <2 x i16*>* undef
br label %block
}
+
+; FALLBACK-WITH-REPORT-ERR-G_IMPLICIT_DEF-LEGALIZABLE: (FIXME: this is what is expected once we can legalize non-pow-of-2 G_IMPLICIT_DEF) remark: <unknown>:0:0: unable to legalize instruction: %vreg1<def>(s96) = G_INSERT %vreg2, %vreg0, 0; (in function: nonpow2_insertvalue_narrowing
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg2<def>(s96) = G_IMPLICIT_DEF; (in function: nonpow2_insertvalue_narrowing
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_insertvalue_narrowing
+; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_insertvalue_narrowing:
+%struct96 = type { float, float, float }
+define void @nonpow2_insertvalue_narrowing(float %a) {
+ %dummy = insertvalue %struct96 undef, float %a, 0
+ ret void
+}
+
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg3<def>(s96) = G_ADD %vreg2, %vreg2; (in function: nonpow2_add_narrowing
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_add_narrowing
+; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_add_narrowing:
+define void @nonpow2_add_narrowing() {
+ %a = add i128 undef, undef
+ %b = trunc i128 %a to i96
+ %dummy = add i96 %b, %b
+ ret void
+}
+
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg3<def>(s96) = G_OR %vreg2, %vreg2; (in function: nonpow2_or_narrowing
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_or_narrowing
+; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_or_narrowing:
+define void @nonpow2_or_narrowing() {
+ %a = add i128 undef, undef
+ %b = trunc i128 %a to i96
+ %dummy = or i96 %b, %b
+ ret void
+}
+
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg0<def>(s96) = G_LOAD %vreg1; mem:LD12[undef](align=16) (in function: nonpow2_load_narrowing
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_load_narrowing
+; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_load_narrowing:
+define void @nonpow2_load_narrowing() {
+ %dummy = load i96, i96* undef
+ ret void
+}
+
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: G_STORE %vreg3, %vreg0; mem:ST12[%c](align=16) (in function: nonpow2_store_narrowing
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_store_narrowing
+; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_store_narrowing:
+define void @nonpow2_store_narrowing(i96* %c) {
+ %a = add i128 undef, undef
+ %b = trunc i128 %a to i96
+ store i96 %b, i96* %c
+ ret void
+}
+
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg0<def>(s96) = G_CONSTANT 0; (in function: nonpow2_constant_narrowing
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_constant_narrowing
+; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_constant_narrowing:
+define void @nonpow2_constant_narrowing() {
+ store i96 0, i96* undef
+ ret void
+}
+
+; Currently can't handle vector lengths that aren't an exact multiple of
+; natively supported vector lengths. Test that the fall-back works for those.
+; FALLBACK-WITH-REPORT-ERR-G_IMPLICIT_DEF-LEGALIZABLE: (FIXME: this is what is expected once we can legalize non-pow-of-2 G_IMPLICIT_DEF) remark: <unknown>:0:0: unable to legalize instruction: %vreg1<def>(<7 x s64>) = G_ADD %vreg0, %vreg0; (in function: nonpow2_vector_add_fewerelements
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg0<def>(<7 x s64>) = G_IMPLICIT_DEF; (in function: nonpow2_vector_add_fewerelements
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_vector_add_fewerelements
+; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_vector_add_fewerelements:
+define void @nonpow2_vector_add_fewerelements() {
+ %dummy = add <7 x i64> undef, undef
+ ret void
+}
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir
index 4042047dfc2..cc158a29c3e 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir
@@ -92,6 +92,10 @@
store double %vres, double* %addr
ret void
}
+
+ define void @fp16Ext32() { ret void }
+ define void @fp16Ext64() { ret void }
+ define void @fp32Ext64() { ret void }
...
---
@@ -742,3 +746,103 @@ body: |
RET_ReallyLR
...
+
+---
+# Make sure we map FPEXT on FPR register bank.
+# CHECK-LABEL: name: fp16Ext32
+name: fp16Ext32
+alignment: 2
+legalized: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 2, class: fpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 3, class: fpr, preferred-register: '' }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# CHECK: %1:gpr(s32) = COPY %w0
+# CHECK-NEXT: %0:gpr(s16) = G_TRUNC %1
+# %0 has been mapped to GPR, we need to repair to match FPR.
+# CHECK-NEXT: %3:fpr(s16) = COPY %0
+# CHECK-NEXT: %2:fpr(s32) = G_FPEXT %3
+# CHECK-NEXT: %s0 = COPY %2
+# CHECK-NEXT: RET_ReallyLR
+
+body: |
+ bb.1:
+ liveins: %w0
+
+ %1(s32) = COPY %w0
+ %0(s16) = G_TRUNC %1(s32)
+ %2(s32) = G_FPEXT %0(s16)
+ %s0 = COPY %2(s32)
+ RET_ReallyLR implicit %s0
+
+...
+
+---
+# Make sure we map FPEXT on FPR register bank.
+# CHECK-LABEL: name: fp16Ext64
+name: fp16Ext64
+alignment: 2
+legalized: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 2, class: fpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 3, class: fpr, preferred-register: '' }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# CHECK: %1:gpr(s32) = COPY %w0
+# CHECK-NEXT: %0:gpr(s16) = G_TRUNC %1
+# %0 has been mapped to GPR, we need to repair to match FPR.
+# CHECK-NEXT: %3:fpr(s16) = COPY %0
+# CHECK-NEXT: %2:fpr(s64) = G_FPEXT %3
+# CHECK-NEXT: %d0 = COPY %2
+# CHECK-NEXT: RET_ReallyLR
+
+body: |
+ bb.1:
+ liveins: %w0
+
+ %1(s32) = COPY %w0
+ %0(s16) = G_TRUNC %1(s32)
+ %2(s64) = G_FPEXT %0(s16)
+ %d0 = COPY %2(s64)
+ RET_ReallyLR implicit %d0
+
+...
+
+---
+# Make sure we map FPEXT on FPR register bank.
+# CHECK-LABEL: name: fp32Ext64
+name: fp32Ext64
+alignment: 2
+legalized: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: fpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 2, class: fpr, preferred-register: '' }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+# CHECK: %0:gpr(s32) = COPY %w0
+# %0 has been mapped to GPR, we need to repair to match FPR.
+# CHECK-NEXT: %2:fpr(s32) = COPY %0
+# CHECK-NEXT: %1:fpr(s64) = G_FPEXT %2
+# CHECK-NEXT: %d0 = COPY %1
+# CHECK-NEXT: RET_ReallyLR
+body: |
+ bb.1:
+ liveins: %w0
+
+ %0(s32) = COPY %w0
+ %1(s64) = G_FPEXT %0(s32)
+ %d0 = COPY %1(s64)
+ RET_ReallyLR implicit %d0
+
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
index fa6727da1bb..20449c53a59 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
@@ -8,6 +8,10 @@
entry:
ret void
}
+ define void @test_scalar_add_big_nonpow2() {
+ entry:
+ ret void
+ }
define void @test_scalar_add_small() {
entry:
ret void
@@ -16,6 +20,10 @@
entry:
ret void
}
+ define void @test_vector_add_nonpow2() {
+ entry:
+ ret void
+ }
...
---
@@ -58,6 +66,49 @@ body: |
...
---
+name: test_scalar_add_big_nonpow2
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+ - { id: 8, class: _ }
+ - { id: 9, class: _ }
+body: |
+ bb.0.entry:
+ liveins: %x0, %x1, %x2, %x3
+ ; CHECK-LABEL: name: test_scalar_add_big_nonpow2
+ ; CHECK-NOT: G_MERGE_VALUES
+ ; CHECK-NOT: G_UNMERGE_VALUES
+ ; CHECK-DAG: [[CARRY0_32:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-DAG: [[CARRY0:%[0-9]+]]:_(s1) = G_TRUNC [[CARRY0_32]]
+ ; CHECK: [[RES_LO:%[0-9]+]]:_(s64), [[CARRY1:%[0-9]+]]:_(s1) = G_UADDE %0, %1, [[CARRY0]]
+ ; CHECK: [[RES_MI:%[0-9]+]]:_(s64), [[CARRY2:%[0-9]+]]:_(s1) = G_UADDE %1, %2, [[CARRY1]]
+ ; CHECK: [[RES_HI:%[0-9]+]]:_(s64), {{%.*}}(s1) = G_UADDE %2, %3, [[CARRY2]]
+ ; CHECK-NOT: G_MERGE_VALUES
+ ; CHECK-NOT: G_UNMERGE_VALUES
+ ; CHECK: %x0 = COPY [[RES_LO]]
+ ; CHECK: %x1 = COPY [[RES_MI]]
+ ; CHECK: %x2 = COPY [[RES_HI]]
+
+ %0(s64) = COPY %x0
+ %1(s64) = COPY %x1
+ %2(s64) = COPY %x2
+ %3(s64) = COPY %x3
+ %4(s192) = G_MERGE_VALUES %0, %1, %2
+ %5(s192) = G_MERGE_VALUES %1, %2, %3
+ %6(s192) = G_ADD %4, %5
+ %7(s64), %8(s64), %9(s64) = G_UNMERGE_VALUES %6
+ %x0 = COPY %7
+ %x1 = COPY %8
+ %x2 = COPY %9
+...
+
+---
name: test_scalar_add_small
registers:
- { id: 0, class: _ }
@@ -124,3 +175,43 @@ body: |
%q0 = COPY %7
%q1 = COPY %8
...
+---
+name: test_vector_add_nonpow2
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+ - { id: 8, class: _ }
+ - { id: 9, class: _ }
+body: |
+ bb.0.entry:
+ liveins: %q0, %q1, %q2, %q3
+ ; CHECK-LABEL: name: test_vector_add_nonpow2
+ ; CHECK-NOT: G_EXTRACT
+ ; CHECK-NOT: G_SEQUENCE
+ ; CHECK: [[RES_LO:%[0-9]+]]:_(<2 x s64>) = G_ADD %0, %1
+ ; CHECK: [[RES_MI:%[0-9]+]]:_(<2 x s64>) = G_ADD %1, %2
+ ; CHECK: [[RES_HI:%[0-9]+]]:_(<2 x s64>) = G_ADD %2, %3
+ ; CHECK-NOT: G_EXTRACT
+ ; CHECK-NOT: G_SEQUENCE
+ ; CHECK: %q0 = COPY [[RES_LO]]
+ ; CHECK: %q1 = COPY [[RES_MI]]
+ ; CHECK: %q2 = COPY [[RES_HI]]
+
+ %0(<2 x s64>) = COPY %q0
+ %1(<2 x s64>) = COPY %q1
+ %2(<2 x s64>) = COPY %q2
+ %3(<2 x s64>) = COPY %q3
+ %4(<6 x s64>) = G_MERGE_VALUES %0, %1, %2
+ %5(<6 x s64>) = G_MERGE_VALUES %1, %2, %3
+ %6(<6 x s64>) = G_ADD %4, %5
+ %7(<2 x s64>), %8(<2 x s64>), %9(<2 x s64>) = G_UNMERGE_VALUES %6
+ %q0 = COPY %7
+ %q1 = COPY %8
+ %q2 = COPY %9
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir b/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir
index 7432b6761b7..405e6b54663 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir
@@ -9,6 +9,7 @@
define void @test_inserts_4() { ret void }
define void @test_inserts_5() { ret void }
define void @test_inserts_6() { ret void }
+ define void @test_inserts_nonpow2() { ret void }
...
---
@@ -141,3 +142,21 @@ body: |
%4:_(s128) = G_INSERT %3, %2, 32
RET_ReallyLR
...
+
+---
+name: test_inserts_nonpow2
+body: |
+ bb.0:
+ liveins: %x0, %x1, %x2
+
+
+ ; CHECK-LABEL: name: test_inserts_nonpow2
+ ; CHECK: %5:_(s192) = G_MERGE_VALUES %3(s64), %1(s64), %2(s64)
+ %0:_(s64) = COPY %x0
+ %1:_(s64) = COPY %x1
+ %2:_(s64) = COPY %x2
+ %3:_(s64) = COPY %x3
+ %4:_(s192) = G_MERGE_VALUES %0, %1, %2
+ %5:_(s192) = G_INSERT %4, %3, 0
+ RET_ReallyLR
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir b/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir
index c7b7ec9b6fe..33b48351106 100644
--- a/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir
+++ b/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir
@@ -15,11 +15,11 @@ body: |
%1:gpr(s64) = G_IMPLICIT_DEF
; CHECK: body:
- ; CHECK: [[TMP:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %0, 15
+ ; CHECK: [[TMP:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32
; CHECK: %2:gpr64 = BFMXri %1, [[TMP]], 0, 31
%2:gpr(s64) = G_INSERT %1, %0, 0
- ; CHECK: [[TMP:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %0, 15
+ ; CHECK: [[TMP:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32
; CHECK: %3:gpr64 = BFMXri %1, [[TMP]], 51, 31
%3:gpr(s64) = G_INSERT %1, %0, 13
diff --git a/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir b/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir
index 2c2e475a87a..bd75c4e661e 100644
--- a/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir
+++ b/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir
@@ -33,7 +33,7 @@ body: |
; CHECK-LABEL: name: anyext_s64_from_s32
; CHECK: [[COPY:%[0-9]+]]:gpr32all = COPY %w0
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64all = SUBREG_TO_REG 0, [[COPY]], 15
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64all = SUBREG_TO_REG 0, [[COPY]], %subreg.sub_32
; CHECK: [[COPY1:%[0-9]+]]:gpr64all = COPY [[SUBREG_TO_REG]]
; CHECK: %x0 = COPY [[COPY1]]
%0(s32) = COPY %w0
@@ -80,7 +80,7 @@ body: |
; CHECK-LABEL: name: zext_s64_from_s32
; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %w0
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[COPY]], 15
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.sub_32
; CHECK: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri [[SUBREG_TO_REG]], 0, 31
; CHECK: %x0 = COPY [[UBFMXri]]
%0(s32) = COPY %w0
@@ -177,7 +177,7 @@ body: |
; CHECK-LABEL: name: sext_s64_from_s32
; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %w0
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[COPY]], 15
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.sub_32
; CHECK: [[SBFMXri:%[0-9]+]]:gpr64 = SBFMXri [[SUBREG_TO_REG]], 0, 31
; CHECK: %x0 = COPY [[SBFMXri]]
%0(s32) = COPY %w0
diff --git a/test/CodeGen/AArch64/dwarf-cfi.ll b/test/CodeGen/AArch64/dwarf-cfi.ll
new file mode 100644
index 00000000000..a75bcd19c69
--- /dev/null
+++ b/test/CodeGen/AArch64/dwarf-cfi.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple aarch64-windows-gnu -filetype=asm -o - %s | FileCheck %s
+
+define void @_Z1gv() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+ invoke void @_Z1fv()
+ to label %try.cont unwind label %lpad
+
+lpad:
+ %0 = landingpad { i8*, i32 }
+ catch i8* null
+ %1 = extractvalue { i8*, i32 } %0, 0
+ %2 = tail call i8* @__cxa_begin_catch(i8* %1) #2
+ tail call void @__cxa_end_catch()
+ br label %try.cont
+
+try.cont:
+ ret void
+}
+
+declare void @_Z1fv()
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+; CHECK-LABEL: _Z1gv:
+; CHECK: .cfi_startproc
+; CHECK: .cfi_personality 0, __gxx_personality_v0
+; CHECK: .cfi_lsda 0, .Lexception0
+; CHECK: str x30, [sp, #-16]!
+; CHECK: .cfi_def_cfa_offset 16
+; CHECK: .cfi_offset w30, -16
+; CHECK: ldr x30, [sp], #16
+; CHECK: .cfi_endproc
diff --git a/test/CodeGen/AArch64/recp-fastmath.ll b/test/CodeGen/AArch64/recp-fastmath.ll
index 38e0fb360e4..4776931cf06 100644
--- a/test/CodeGen/AArch64/recp-fastmath.ll
+++ b/test/CodeGen/AArch64/recp-fastmath.ll
@@ -18,6 +18,8 @@ define float @frecp1(float %x) #1 {
; CHECK-NEXT: BB#0
; CHECK-NEXT: frecpe [[R:s[0-7]]]
; CHECK-NEXT: frecps {{s[0-7](, s[0-7])?}}, [[R]]
+; CHECK: frecps {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}}
+; CHECK-NOT: frecps {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}}
}
define <2 x float> @f2recp0(<2 x float> %x) #0 {
@@ -38,6 +40,8 @@ define <2 x float> @f2recp1(<2 x float> %x) #1 {
; CHECK-NEXT: BB#0
; CHECK-NEXT: frecpe [[R:v[0-7]\.2s]]
; CHECK-NEXT: frecps {{v[0-7]\.2s(, v[0-7].2s)?}}, [[R]]
+; CHECK: frecps {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}}
+; CHECK-NOT: frecps {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}}
}
define <4 x float> @f4recp0(<4 x float> %x) #0 {
@@ -58,6 +62,8 @@ define <4 x float> @f4recp1(<4 x float> %x) #1 {
; CHECK-NEXT: BB#0
; CHECK-NEXT: frecpe [[R:v[0-7]\.4s]]
; CHECK-NEXT: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[R]]
+; CHECK: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK-NOT: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
}
define <8 x float> @f8recp0(<8 x float> %x) #0 {
@@ -77,10 +83,12 @@ define <8 x float> @f8recp1(<8 x float> %x) #1 {
; CHECK-LABEL: f8recp1:
; CHECK-NEXT: BB#0
-; CHECK-NEXT: frecpe [[RA:v[0-7]\.4s]]
-; CHECK-NEXT: frecpe [[RB:v[0-7]\.4s]]
-; CHECK-NEXT: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[RA]]
-; CHECK: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[RB]]
+; CHECK-NEXT: frecpe [[R:v[0-7]\.4s]]
+; CHECK: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[R]]
+; CHECK: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, {{v[0-7]\.4s}}
+; CHECK: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK-NOT: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
}
define double @drecp0(double %x) #0 {
@@ -101,6 +109,9 @@ define double @drecp1(double %x) #1 {
; CHECK-NEXT: BB#0
; CHECK-NEXT: frecpe [[R:d[0-7]]]
; CHECK-NEXT: frecps {{d[0-7](, d[0-7])?}}, [[R]]
+; CHECK: frecps {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
+; CHECK: frecps {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
+; CHECK-NOT: frecps {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
}
define <2 x double> @d2recp0(<2 x double> %x) #0 {
@@ -121,6 +132,9 @@ define <2 x double> @d2recp1(<2 x double> %x) #1 {
; CHECK-NEXT: BB#0
; CHECK-NEXT: frecpe [[R:v[0-7]\.2d]]
; CHECK-NEXT: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[R]]
+; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
}
define <4 x double> @d4recp0(<4 x double> %x) #0 {
@@ -140,10 +154,14 @@ define <4 x double> @d4recp1(<4 x double> %x) #1 {
; CHECK-LABEL: d4recp1:
; CHECK-NEXT: BB#0
-; CHECK-NEXT: frecpe [[RA:v[0-7]\.2d]]
-; CHECK-NEXT: frecpe [[RB:v[0-7]\.2d]]
-; CHECK-NEXT: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[RA]]
-; CHECK: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[RB]]
+; CHECK-NEXT: frecpe [[R:v[0-7]\.2d]]
+; CHECK: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[R]]
+; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
}
attributes #0 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AArch64/sqrt-fastmath.ll b/test/CodeGen/AArch64/sqrt-fastmath.ll
index 079562c0581..4dd0516faf0 100644
--- a/test/CodeGen/AArch64/sqrt-fastmath.ll
+++ b/test/CodeGen/AArch64/sqrt-fastmath.ll
@@ -22,7 +22,9 @@ define float @fsqrt(float %a) #0 {
; CHECK-NEXT: frsqrte [[RA:s[0-7]]]
; CHECK-NEXT: fmul [[RB:s[0-7]]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{s[0-7](, s[0-7])?}}, [[RB]]
-; CHECK: fcmp s0, #0
+; CHECK: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}}
+; CHECK-NOT: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}}
+; CHECK: fcmp {{s[0-7]}}, #0
}
define <2 x float> @f2sqrt(<2 x float> %a) #0 {
@@ -38,7 +40,9 @@ define <2 x float> @f2sqrt(<2 x float> %a) #0 {
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2s]]
; CHECK-NEXT: fmul [[RB:v[0-7]\.2s]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{v[0-7]\.2s(, v[0-7]\.2s)?}}, [[RB]]
-; CHECK: fcmeq {{v[0-7]\.2s, v0\.2s}}, #0
+; CHECK: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}}
+; CHECK-NOT: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}}
+; CHECK: fcmeq {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, #0
}
define <4 x float> @f4sqrt(<4 x float> %a) #0 {
@@ -54,7 +58,9 @@ define <4 x float> @f4sqrt(<4 x float> %a) #0 {
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]]
; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]]
-; CHECK: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0
+; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0
}
define <8 x float> @f8sqrt(<8 x float> %a) #0 {
@@ -69,9 +75,16 @@ define <8 x float> @f8sqrt(<8 x float> %a) #0 {
; CHECK-LABEL: f8sqrt:
; CHECK-NEXT: BB#0
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]]
-; CHECK: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]]
-; CHECK: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]]
-; CHECK: fcmeq {{v[0-7]\.4s, v[0-1]\.4s}}, #0
+; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]]
+; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]]
+; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0
+; CHECK: frsqrte [[RC:v[0-7]\.4s]]
+; CHECK-NEXT: fmul [[RD:v[0-7]\.4s]], [[RC]], [[RC]]
+; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RD]]
+; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0
}
define double @dsqrt(double %a) #0 {
@@ -87,7 +100,10 @@ define double @dsqrt(double %a) #0 {
; CHECK-NEXT: frsqrte [[RA:d[0-7]]]
; CHECK-NEXT: fmul [[RB:d[0-7]]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{d[0-7](, d[0-7])?}}, [[RB]]
-; CHECK: fcmp d0, #0
+; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
+; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
+; CHECK-NOT: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
+; CHECK: fcmp {{d[0-7]}}, #0
}
define <2 x double> @d2sqrt(<2 x double> %a) #0 {
@@ -103,7 +119,10 @@ define <2 x double> @d2sqrt(<2 x double> %a) #0 {
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]]
; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]]
-; CHECK: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0
}
define <4 x double> @d4sqrt(<4 x double> %a) #0 {
@@ -118,9 +137,19 @@ define <4 x double> @d4sqrt(<4 x double> %a) #0 {
; CHECK-LABEL: d4sqrt:
; CHECK-NEXT: BB#0
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]]
-; CHECK: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]]
-; CHECK: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]]
-; CHECK: fcmeq {{v[0-7]\.2d, v[0-1]\.2d}}, #0
+; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]]
+; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]]
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0
+; CHECK: frsqrte [[RC:v[0-7]\.2d]]
+; CHECK-NEXT: fmul [[RD:v[0-7]\.2d]], [[RC]], [[RC]]
+; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RD]]
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0
}
define float @frsqrt(float %a) #0 {
@@ -137,6 +166,8 @@ define float @frsqrt(float %a) #0 {
; CHECK-NEXT: frsqrte [[RA:s[0-7]]]
; CHECK-NEXT: fmul [[RB:s[0-7]]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{s[0-7](, s[0-7])?}}, [[RB]]
+; CHECK: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}}
+; CHECK-NOT: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}}
; CHECK-NOT: fcmp {{s[0-7]}}, #0
}
@@ -154,7 +185,9 @@ define <2 x float> @f2rsqrt(<2 x float> %a) #0 {
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2s]]
; CHECK-NEXT: fmul [[RB:v[0-7]\.2s]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{v[0-7]\.2s(, v[0-7]\.2s)?}}, [[RB]]
-; CHECK-NOT: fcmeq {{v[0-7]\.2s, v0\.2s}}, #0
+; CHECK: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}}
+; CHECK-NOT: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}}
+; CHECK-NOT: fcmeq {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, #0
}
define <4 x float> @f4rsqrt(<4 x float> %a) #0 {
@@ -171,7 +204,9 @@ define <4 x float> @f4rsqrt(<4 x float> %a) #0 {
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]]
; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]]
-; CHECK-NOT: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0
+; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK-NOT: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0
}
define <8 x float> @f8rsqrt(<8 x float> %a) #0 {
@@ -189,7 +224,11 @@ define <8 x float> @f8rsqrt(<8 x float> %a) #0 {
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]]
; CHECK: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]]
; CHECK: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]]
-; CHECK-NOT: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0
+; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}}
+; CHECK-NOT: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0
}
define double @drsqrt(double %a) #0 {
@@ -206,6 +245,9 @@ define double @drsqrt(double %a) #0 {
; CHECK-NEXT: frsqrte [[RA:d[0-7]]]
; CHECK-NEXT: fmul [[RB:d[0-7]]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{d[0-7](, d[0-7])?}}, [[RB]]
+; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
+; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
+; CHECK-NOT: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}}
; CHECK-NOT: fcmp d0, #0
}
@@ -223,7 +265,10 @@ define <2 x double> @d2rsqrt(<2 x double> %a) #0 {
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]]
; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]]
; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]]
-; CHECK-NOT: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0
}
define <4 x double> @d4rsqrt(<4 x double> %a) #0 {
@@ -241,7 +286,13 @@ define <4 x double> @d4rsqrt(<4 x double> %a) #0 {
; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]]
; CHECK: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]]
; CHECK: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]]
-; CHECK-NOT: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
+; CHECK-NOT: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0
}
attributes #0 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
index 4c05383615a..70e2b5e4ae2 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
@@ -44,28 +44,28 @@ regBankSelected: true
# Max immediate for CI
# SIVI: [[K_LO:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967292
# SIVI: [[K_HI:%[0-9]+]]:sreg_32 = S_MOV_B32 3
-# SIVI: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2
+# SIVI: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], %subreg.sub0, [[K_HI]], %subreg.sub1
# SIVI-DAG: [[K_SUB0:%[0-9]+]]:sgpr_32 = COPY [[K]].sub0
# SIVI-DAG: [[PTR_LO:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub0
# SIVI: [[ADD_PTR_LO:%[0-9]+]]:sreg_32 = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
# SIVI-DAG: [[K_SUB1:%[0-9]+]]:sgpr_32 = COPY [[K]].sub1
# SIVI-DAG: [[PTR_HI:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub1
# SIVI: [[ADD_PTR_HI:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]]
-# SIVI: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2
+# SIVI: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], %subreg.sub0, [[ADD_PTR_HI]], %subreg.sub1
# SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 4294967295, 0
# Immediate overflow for CI
# GCN: [[K_LO:%[0-9]+]]:sreg_32 = S_MOV_B32 0
# GCN: [[K_HI:%[0-9]+]]:sreg_32 = S_MOV_B32 4
-# GCN: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2
+# GCN: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], %subreg.sub0, [[K_HI]], %subreg.sub1
# GCN-DAG: [[K_SUB0:%[0-9]+]]:sgpr_32 = COPY [[K]].sub0
# GCN-DAG: [[PTR_LO:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub0
# GCN: [[ADD_PTR_LO:%[0-9]+]]:sreg_32 = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
# GCN-DAG: [[K_SUB1:%[0-9]+]]:sgpr_32 = COPY [[K]].sub1
# GCN-DAG: [[PTR_HI:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub1
# GCN: [[ADD_PTR_HI:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]]
-# GCN: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2
+# GCN: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], %subreg.sub0, [[ADD_PTR_HI]], %subreg.sub1
# GCN: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
# Max 32-bit byte offset
@@ -76,14 +76,14 @@ regBankSelected: true
# Overflow 32-bit byte offset
# SIVI: [[K_LO:%[0-9]+]]:sreg_32 = S_MOV_B32 0
# SIVI: [[K_HI:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-# SIVI: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2
+# SIVI: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], %subreg.sub0, [[K_HI]], %subreg.sub1
# SIVI-DAG: [[K_SUB0:%[0-9]+]]:sgpr_32 = COPY [[K]].sub0
# SIVI-DAG: [[PTR_LO:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub0
# SIVI: [[ADD_PTR_LO:%[0-9]+]]:sreg_32 = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
# SIVI-DAG: [[K_SUB1:%[0-9]+]]:sgpr_32 = COPY [[K]].sub1
# SIVI-DAG: [[PTR_HI:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub1
# SIVI: [[ADD_PTR_HI:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]]
-# SIVI: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2
+# SIVI: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], %subreg.sub0, [[ADD_PTR_HI]], %subreg.sub1
# SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1073741824, 0
diff --git a/test/CodeGen/AMDGPU/detect-dead-lanes.mir b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
index b2f5e816b26..12460d25f3b 100644
--- a/test/CodeGen/AMDGPU/detect-dead-lanes.mir
+++ b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
@@ -6,7 +6,7 @@
# CHECK: S_NOP 0, implicit-def %0
# CHECK: S_NOP 0, implicit-def %1
# CHECK: S_NOP 0, implicit-def dead %2
-# CHECK: %3:sreg_128 = REG_SEQUENCE %0, {{[0-9]+}}, %1, {{[0-9]+}}, undef %2, {{[0-9]+}}
+# CHECK: %3:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, undef %2, %subreg.sub3
# CHECK: S_NOP 0, implicit %3.sub0
# CHECK: S_NOP 0, implicit %3.sub1
# CHECK: S_NOP 0, implicit undef %3.sub2
@@ -42,9 +42,9 @@ body: |
# Check defined lanes transfer; Includes checking for some special cases like
# undef operands or IMPLICIT_DEF definitions.
# CHECK-LABEL: name: test1
-# CHECK: %0:sreg_128 = REG_SEQUENCE %sgpr0, {{[0-9]+}}, %sgpr0, {{[0-9]+}}
-# CHECK: %1:sreg_128 = INSERT_SUBREG %0, %sgpr1, {{[0-9]+}}
-# CHECK: %2:sreg_64 = INSERT_SUBREG %0.sub2_sub3, %sgpr42, {{[0-9]+}}
+# CHECK: %0:sreg_128 = REG_SEQUENCE %sgpr0, %subreg.sub0, %sgpr0, %subreg.sub2
+# CHECK: %1:sreg_128 = INSERT_SUBREG %0, %sgpr1, %subreg.sub3
+# CHECK: %2:sreg_64 = INSERT_SUBREG %0.sub2_sub3, %sgpr42, %subreg.sub0
# CHECK: S_NOP 0, implicit %1.sub0
# CHECK: S_NOP 0, implicit undef %1.sub1
# CHECK: S_NOP 0, implicit %1.sub2
@@ -53,24 +53,24 @@ body: |
# CHECK: S_NOP 0, implicit undef %2.sub1
# CHECK: %3:sreg_32_xm0 = IMPLICIT_DEF
-# CHECK: %4:sreg_128 = INSERT_SUBREG %0, undef %3, {{[0-9]+}}
+# CHECK: %4:sreg_128 = INSERT_SUBREG %0, undef %3, %subreg.sub0
# CHECK: S_NOP 0, implicit undef %4.sub0
# CHECK: S_NOP 0, implicit undef %4.sub1
# CHECK: S_NOP 0, implicit %4.sub2
# CHECK: S_NOP 0, implicit undef %4.sub3
-# CHECK: %5:sreg_64 = EXTRACT_SUBREG %0, {{[0-9]+}}
-# CHECK: %6:sreg_32_xm0 = EXTRACT_SUBREG %5, {{[0-9]+}}
-# CHECK: %7:sreg_32_xm0 = EXTRACT_SUBREG %5, {{[0-9]+}}
+# CHECK: %5:sreg_64 = EXTRACT_SUBREG %0, %subreg.sub0_sub1
+# CHECK: %6:sreg_32_xm0 = EXTRACT_SUBREG %5, %subreg.sub0
+# CHECK: %7:sreg_32_xm0 = EXTRACT_SUBREG %5, %subreg.sub1
# CHECK: S_NOP 0, implicit %5
# CHECK: S_NOP 0, implicit %6
# CHECK: S_NOP 0, implicit undef %7
# CHECK: %8:sreg_64 = IMPLICIT_DEF
-# CHECK: %9:sreg_32_xm0 = EXTRACT_SUBREG undef %8, {{[0-9]+}}
+# CHECK: %9:sreg_32_xm0 = EXTRACT_SUBREG undef %8, %subreg.sub1
# CHECK: S_NOP 0, implicit undef %9
-# CHECK: %10:sreg_128 = EXTRACT_SUBREG undef %0, {{[0-9]+}}
+# CHECK: %10:sreg_128 = EXTRACT_SUBREG undef %0, %subreg.sub2_sub3
# CHECK: S_NOP 0, implicit undef %10
name: test1
registers:
@@ -125,29 +125,29 @@ body: |
# CHECK: S_NOP 0, implicit-def dead %0
# CHECK: S_NOP 0, implicit-def %1
# CHECK: S_NOP 0, implicit-def %2
-# CHECK: %3:sreg_128 = REG_SEQUENCE undef %0, {{[0-9]+}}, %1, {{[0-9]+}}, %2, {{[0-9]+}}
+# CHECK: %3:sreg_128 = REG_SEQUENCE undef %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2_sub3
# CHECK: S_NOP 0, implicit %3.sub1
# CHECK: S_NOP 0, implicit %3.sub3
# CHECK: S_NOP 0, implicit-def %4
# CHECK: S_NOP 0, implicit-def dead %5
-# CHECK: %6:sreg_64 = REG_SEQUENCE %4, {{[0-9]+}}, undef %5, {{[0-9]+}}
+# CHECK: %6:sreg_64 = REG_SEQUENCE %4, %subreg.sub0, undef %5, %subreg.sub1
# CHECK: S_NOP 0, implicit %6
# CHECK: S_NOP 0, implicit-def dead %7
# CHECK: S_NOP 0, implicit-def %8
-# CHECK: %9:sreg_128 = INSERT_SUBREG undef %7, %8, {{[0-9]+}}
+# CHECK: %9:sreg_128 = INSERT_SUBREG undef %7, %8, %subreg.sub2_sub3
# CHECK: S_NOP 0, implicit %9.sub2
# CHECK: S_NOP 0, implicit-def %10
# CHECK: S_NOP 0, implicit-def dead %11
-# CHECK: %12:sreg_128 = INSERT_SUBREG %10, undef %11, {{[0-9]+}}
+# CHECK: %12:sreg_128 = INSERT_SUBREG %10, undef %11, %subreg.sub0_sub1
# CHECK: S_NOP 0, implicit %12.sub3
# CHECK: S_NOP 0, implicit-def %13
# CHECK: S_NOP 0, implicit-def dead %14
-# CHECK: %15:sreg_128 = REG_SEQUENCE %13, {{[0-9]+}}, undef %14, {{[0-9]+}}
-# CHECK: %16:sreg_64 = EXTRACT_SUBREG %15, {{[0-9]+}}
+# CHECK: %15:sreg_128 = REG_SEQUENCE %13, %subreg.sub0_sub1, undef %14, %subreg.sub2_sub3
+# CHECK: %16:sreg_64 = EXTRACT_SUBREG %15, %subreg.sub0_sub1
# CHECK: S_NOP 0, implicit %16.sub1
name: test2
@@ -245,7 +245,7 @@ body: |
# used.
# CHECK-LABEL: name: test5
# CHECK: S_NOP 0, implicit-def %0
-# CHECK: %1:sreg_64 = REG_SEQUENCE undef %0, {{[0-9]+}}, %0, {{[0-9]+}}
+# CHECK: %1:sreg_64 = REG_SEQUENCE undef %0, %subreg.sub0, %0, %subreg.sub1
# CHECK: S_NOP 0, implicit %1.sub1
name: test5
tracksRegLiveness: true
@@ -265,7 +265,7 @@ body: |
# CHECK: S_NOP 0, implicit-def %0
# CHECK: S_NOP 0, implicit-def dead %1
# CHECK: S_NOP 0, implicit-def dead %2
-# CHECK: %3:sreg_128 = REG_SEQUENCE %0, {{[0-9]+}}, undef %1, {{[0-9]+}}, undef %2, {{[0-9]+}}
+# CHECK: %3:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, undef %1, %subreg.sub1, undef %2, %subreg.sub2
# CHECK: bb.1:
# CHECK: %4:sreg_128 = PHI %3, %bb.0, %5, %bb.1
@@ -315,12 +315,12 @@ body: |
# CHECK: S_NOP 0, implicit-def %1
# CHECK: S_NOP 0, implicit-def dead %2
# CHECK: S_NOP 0, implicit-def %3
-# CHECK: %4:sreg_128 = REG_SEQUENCE %0, {{[0-9]+}}, %1, {{[0-9]+}}, undef %2, {{[0-9]+}}, %3, {{[0-9]+}}
+# CHECK: %4:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, undef %2, %subreg.sub2, %3, %subreg.sub3
# CHECK: bb.1:
# CHECK: %5:sreg_128 = PHI %4, %bb.0, %6, %bb.1
-# CHECK: %6:sreg_128 = REG_SEQUENCE %5.sub1, {{[0-9]+}}, %5.sub3, {{[0-9]+}}, undef %5.sub2, {{[0-9]+}}, %5.sub0, {{[0-9]+}}
+# CHECK: %6:sreg_128 = REG_SEQUENCE %5.sub1, %subreg.sub0, %5.sub3, %subreg.sub1, undef %5.sub2, %subreg.sub2, %5.sub0, %subreg.sub3
# CHECK: bb.2:
# CHECK: S_NOP 0, implicit %6.sub3
@@ -361,12 +361,12 @@ body: |
# CHECK-LABEL: name: loop2
# CHECK: bb.0:
# CHECK: S_NOP 0, implicit-def %0
-# CHECK: %1:sreg_128 = REG_SEQUENCE %0, {{[0-9]+}}
+# CHECK: %1:sreg_128 = REG_SEQUENCE %0, %subreg.sub0
# CHECK: bb.1:
# CHECK: %2:sreg_128 = PHI %1, %bb.0, %3, %bb.1
-# CHECK: %3:sreg_128 = REG_SEQUENCE %2.sub3, {{[0-9]+}}, undef %2.sub1, {{[0-9]+}}, %2.sub0, {{[0-9]+}}, %2.sub2, {{[0-9]+}}
+# CHECK: %3:sreg_128 = REG_SEQUENCE %2.sub3, %subreg.sub0, undef %2.sub1, %subreg.sub1, %2.sub0, %subreg.sub2, %2.sub2, %subreg.sub3
# CHECK: bb.2:
# CHECK: S_NOP 0, implicit %2.sub0
diff --git a/test/CodeGen/AMDGPU/mad_64_32.ll b/test/CodeGen/AMDGPU/mad_64_32.ll
new file mode 100644
index 00000000000..b4d9d928101
--- /dev/null
+++ b/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -0,0 +1,168 @@
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+
+; GCN-LABEL: {{^}}mad_i64_i32_sextops:
+; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
+
+; SI: v_mul_lo_i32
+; SI: v_mul_hi_i32
+; SI: v_add_i32
+; SI: v_addc_u32
+define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
+ %sext0 = sext i32 %arg0 to i64
+ %sext1 = sext i32 %arg1 to i64
+ %mul = mul i64 %sext0, %sext1
+ %mad = add i64 %mul, %arg2
+ ret i64 %mad
+}
+
+; GCN-LABEL: {{^}}mad_i64_i32_sextops_commute:
+; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
+
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_i32
+; SI: v_add_i32
+; SI: v_addc_u32
+define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
+ %sext0 = sext i32 %arg0 to i64
+ %sext1 = sext i32 %arg1 to i64
+ %mul = mul i64 %sext0, %sext1
+ %mad = add i64 %arg2, %mul
+ ret i64 %mad
+}
+
+; GCN-LABEL: {{^}}mad_u64_u32_zextops:
+; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3]
+
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI: v_add_i32
+; SI: v_addc_u32
+define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
+ %sext0 = zext i32 %arg0 to i64
+ %sext1 = zext i32 %arg1 to i64
+ %mul = mul i64 %sext0, %sext1
+ %mad = add i64 %mul, %arg2
+ ret i64 %mad
+}
+
+; GCN-LABEL: {{^}}mad_u64_u32_zextops_commute:
+; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3]
+
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI: v_add_i32
+; SI: v_addc_u32
+define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
+ %sext0 = zext i32 %arg0 to i64
+ %sext1 = zext i32 %arg1 to i64
+ %mul = mul i64 %sext0, %sext1
+ %mad = add i64 %arg2, %mul
+ ret i64 %mad
+}
+
+
+
+
+
+
+; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i128:
+; CI: v_mad_u64_u32
+; CI: v_mad_u64_u32
+; CI: v_mad_u64_u32
+; CI: v_mad_i64_i32
+
+; SI-NOT: v_mad_
+define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
+ %sext0 = sext i32 %arg0 to i128
+ %sext1 = sext i32 %arg1 to i128
+ %mul = mul i128 %sext0, %sext1
+ %mad = add i128 %mul, %arg2
+ ret i128 %mad
+}
+
+; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i63:
+; CI: v_lshl_b64
+; CI: v_ashr
+; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
+
+; SI-NOT: v_mad_u64_u32
+define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
+ %sext0 = sext i32 %arg0 to i63
+ %sext1 = sext i32 %arg1 to i63
+ %mul = mul i63 %sext0, %sext1
+ %mad = add i63 %mul, %arg2
+ ret i63 %mad
+}
+
+; GCN-LABEL: {{^}}mad_i64_i32_sextops_i31_i63:
+; CI: v_lshl_b64
+; CI: v_ashr_i64
+; CI: v_bfe_i32 v1, v1, 0, 31
+; CI: v_bfe_i32 v0, v0, 0, 31
+; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
+define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
+ %sext0 = sext i31 %arg0 to i63
+ %sext1 = sext i31 %arg1 to i63
+ %mul = mul i63 %sext0, %sext1
+ %mad = add i63 %mul, %arg2
+ ret i63 %mad
+}
+
+; GCN-LABEL: {{^}}mad_u64_u32_bitops:
+; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v2, v[4:5]
+define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
+ %trunc.lhs = and i64 %arg0, 4294967295
+ %trunc.rhs = and i64 %arg1, 4294967295
+ %mul = mul i64 %trunc.lhs, %trunc.rhs
+ %add = add i64 %mul, %arg2
+ ret i64 %add
+}
+
+; GCN-LABEL: {{^}}mad_u64_u32_bitops_lhs_mask_small:
+; GCN-NOT: v_mad_
+define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
+ %trunc.lhs = and i64 %arg0, 8589934591
+ %trunc.rhs = and i64 %arg1, 4294967295
+ %mul = mul i64 %trunc.lhs, %trunc.rhs
+ %add = add i64 %mul, %arg2
+ ret i64 %add
+}
+
+; GCN-LABEL: {{^}}mad_u64_u32_bitops_rhs_mask_small:
+; GCN-NOT: v_mad_
+define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
+ %trunc.lhs = and i64 %arg0, 4294967295
+ %trunc.rhs = and i64 %arg1, 8589934591
+ %mul = mul i64 %trunc.lhs, %trunc.rhs
+ %add = add i64 %mul, %arg2
+ ret i64 %add
+}
+
+; GCN-LABEL: {{^}}mad_i64_i32_bitops:
+; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v2, v[4:5]
+; SI-NOT: v_mad_
+define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
+ %shl.lhs = shl i64 %arg0, 32
+ %trunc.lhs = ashr i64 %shl.lhs, 32
+ %shl.rhs = shl i64 %arg1, 32
+ %trunc.rhs = ashr i64 %shl.rhs, 32
+ %mul = mul i64 %trunc.lhs, %trunc.rhs
+ %add = add i64 %mul, %arg2
+ ret i64 %add
+}
+
+; Example from bug report
+; GCN-LABEL: {{^}}mad_i64_i32_unpack_i64ops:
+; CI: v_mad_u64_u32 v[0:1], s[6:7], v1, v0, v[0:1]
+; SI-NOT: v_mad_u64_u32
+define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
+ %tmp4 = lshr i64 %arg0, 32
+ %tmp5 = and i64 %arg0, 4294967295
+ %mul = mul nuw i64 %tmp4, %tmp5
+ %mad = add i64 %mul, %arg0
+ ret i64 %mad
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll
index a0290789175..555c65a6ffe 100644
--- a/test/CodeGen/AMDGPU/mul.ll
+++ b/test/CodeGen/AMDGPU/mul.ll
@@ -1,6 +1,6 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
; mul24 and mad24 are affected
@@ -8,8 +8,8 @@
; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
@@ -26,10 +26,10 @@ define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32
; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
@@ -41,10 +41,10 @@ define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> a
}
; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32:
-; SI: s_load_dword
-; SI: s_load_dword
-; SI: s_mul_i32
-; SI: buffer_store_dword
+; GCN: s_load_dword
+; GCN: s_load_dword
+; GCN: s_mul_i32
+; GCN: buffer_store_dword
define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
%mul = mul i64 %b, %a
%trunc = trunc i64 %mul to i32
@@ -53,10 +53,10 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a
}
; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32:
-; SI: s_load_dword
-; SI: s_load_dword
-; SI: v_mul_lo_i32
-; SI: buffer_store_dword
+; GCN: s_load_dword
+; GCN: s_load_dword
+; GCN: v_mul_lo_i32
+; GCN: buffer_store_dword
define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%b = load i64, i64 addrspace(1)* %bptr, align 8
@@ -71,8 +71,8 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 ad
; FUNC-LABEL: {{^}}mul64_sext_c:
; EG-DAG: MULLO_INT
; EG-DAG: MULHI_INT
-; SI-DAG: s_mul_i32
-; SI-DAG: v_mul_hi_i32
+; GCN-DAG: s_mul_i32
+; GCN-DAG: v_mul_hi_i32
define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {
entry:
%0 = sext i32 %in to i64
@@ -84,9 +84,9 @@ entry:
; FUNC-LABEL: {{^}}v_mul64_sext_c:
; EG-DAG: MULLO_INT
; EG-DAG: MULHI_INT
-; SI-DAG: v_mul_lo_i32
-; SI-DAG: v_mul_hi_i32
-; SI: s_endpgm
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_mul_hi_i32
+; GCN: s_endpgm
define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
%val = load i32, i32 addrspace(1)* %in, align 4
%ext = sext i32 %val to i64
@@ -96,9 +96,9 @@ define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(
}
; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm:
-; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
-; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
-; SI: s_endpgm
+; GCN-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
+; GCN-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
+; GCN: s_endpgm
define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
%val = load i32, i32 addrspace(1)* %in, align 4
%ext = sext i32 %val to i64
@@ -108,12 +108,12 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 a
}
; FUNC-LABEL: {{^}}s_mul_i32:
-; SI: s_load_dword [[SRC0:s[0-9]+]],
-; SI: s_load_dword [[SRC1:s[0-9]+]],
-; SI: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]]
-; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: buffer_store_dword [[VRESULT]],
-; SI: s_endpgm
+; GCN: s_load_dword [[SRC0:s[0-9]+]],
+; GCN: s_load_dword [[SRC1:s[0-9]+]],
+; GCN: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]]
+; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; GCN: buffer_store_dword [[VRESULT]],
+; GCN: s_endpgm
define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%mul = mul i32 %a, %b
store i32 %mul, i32 addrspace(1)* %out, align 4
@@ -121,7 +121,7 @@ define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nou
}
; FUNC-LABEL: {{^}}v_mul_i32:
-; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
@@ -146,7 +146,7 @@ define amdgpu_kernel void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nou
}
; FUNC-LABEL: {{^}}v_mul_i64:
-; SI: v_mul_lo_i32
+; GCN: v_mul_lo_i32
define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%b = load i64, i64 addrspace(1)* %bptr, align 8
@@ -156,7 +156,7 @@ define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
}
; FUNC-LABEL: {{^}}mul32_in_branch:
-; SI: s_mul_i32
+; GCN: s_mul_i32
define amdgpu_kernel void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) {
entry:
%0 = icmp eq i32 %a, 0
@@ -177,9 +177,9 @@ endif:
}
; FUNC-LABEL: {{^}}mul64_in_branch:
-; SI-DAG: s_mul_i32
-; SI-DAG: v_mul_hi_u32
-; SI: s_endpgm
+; GCN-DAG: s_mul_i32
+; GCN-DAG: v_mul_hi_u32
+; GCN: s_endpgm
define amdgpu_kernel void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
entry:
%0 = icmp eq i64 %a, 0
@@ -201,29 +201,41 @@ endif:
; FIXME: Load dwordx4
; FUNC-LABEL: {{^}}s_mul_i128:
-; SI: s_load_dwordx2
-; SI: s_load_dwordx2
-; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
; SI: v_mul_hi_u32
; SI: v_mul_hi_u32
; SI: s_mul_i32
; SI: v_mul_hi_u32
; SI: s_mul_i32
+
; SI-DAG: s_mul_i32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_hi_u32
; SI-DAG: s_mul_i32
; SI-DAG: s_mul_i32
; SI-DAG: v_mul_hi_u32
+
; SI: s_mul_i32
; SI: s_mul_i32
; SI: s_mul_i32
; SI: s_mul_i32
; SI: s_mul_i32
-; SI: buffer_store_dwordx4
+
+; VI: s_mul_i32
+; VI: v_mul_hi_u32
+; VI: v_mad_u64_u32
+; VI: s_mul_i32
+; VI: v_mul_hi_u32
+; VI: v_mad_u64_u32
+; VI: v_mad_u64_u32
+
+
+; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
%mul = mul i128 %a, %b
store i128 %mul, i128 addrspace(1)* %out
@@ -231,18 +243,19 @@ define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b)
}
; FUNC-LABEL: {{^}}v_mul_i128:
-; SI: {{buffer|flat}}_load_dwordx4
-; SI: {{buffer|flat}}_load_dwordx4
+; GCN: {{buffer|flat}}_load_dwordx4
+; GCN: {{buffer|flat}}_load_dwordx4
+
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_add_i32_e32
-; SI-DAG: v_mul_lo_i32
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_mul_lo_i32
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_mul_hi_u32
-; SI-DAG: v_mul_lo_i32
-; SI-DAG: v_mul_lo_i32
-; SI: v_add_i32_e32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_hi_u32
@@ -252,7 +265,11 @@ define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b)
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
-; SI: {{buffer|flat}}_store_dwordx4
+; VI-DAG: v_mad_u64_u32
+; VI: v_mad_u64_u32
+; VI: v_mad_u64_u32
+
+; GCN: {{buffer|flat}}_store_dwordx4
define amdgpu_kernel void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%gep.a = getelementptr inbounds i128, i128 addrspace(1)* %aptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
index 6c6590a154a..9702d18d905 100644
--- a/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
+++ b/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
@@ -5,19 +5,19 @@
# GCN-LABEL: {{^}}name: const_to_sgpr{{$}}
# GCN: %[[HI:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
# GCN-NEXT: %[[LO:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576
-# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]]:sreg_64 = REG_SEQUENCE killed %[[LO]], 1, killed %[[HI]], 2
+# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]]:sreg_64 = REG_SEQUENCE killed %[[LO]], %subreg.sub0, killed %[[HI]], %subreg.sub1
# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec
# GCN-LABEL: {{^}}name: const_to_sgpr_multiple_use{{$}}
# GCN: %[[HI:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
# GCN-NEXT: %[[LO:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576
-# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]]:sreg_64 = REG_SEQUENCE killed %[[LO]], 1, killed %[[HI]], 2
+# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]]:sreg_64 = REG_SEQUENCE killed %[[LO]], %subreg.sub0, killed %[[HI]], %subreg.sub1
# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec
# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec
# GCN-LABEL: {{^}}name: const_to_sgpr_subreg{{$}}
-# GCN: %[[OP0:[0-9]+]]:vreg_64 = REG_SEQUENCE killed %{{[0-9]+}}, 1, killed %{{[0-9]+}}, 2
+# GCN: %[[OP0:[0-9]+]]:vreg_64 = REG_SEQUENCE killed %{{[0-9]+}}, %subreg.sub0, killed %{{[0-9]+}}, %subreg.sub1
# GCN-NEXT: V_CMP_LT_U32_e64 killed %[[OP0]].sub0, 12, implicit %exec
--- |
@@ -109,7 +109,7 @@ body: |
%8 = S_LOAD_DWORDX2_IMM %3, 11, 0
%6 = COPY %7
%9 = S_MOV_B32 0
- %10 = REG_SEQUENCE %2, 1, killed %9, 2
+ %10 = REG_SEQUENCE %2, %subreg.sub0, killed %9, %subreg.sub1
%0 = COPY %10
%11 = COPY %10.sub0
%12 = COPY %10.sub1
@@ -117,10 +117,10 @@ body: |
%14 = COPY %8.sub1
%15 = S_ADD_U32 killed %11, killed %13, implicit-def %scc
%16 = S_ADDC_U32 killed %12, killed %14, implicit-def dead %scc, implicit %scc
- %17 = REG_SEQUENCE killed %15, 1, killed %16, 2
+ %17 = REG_SEQUENCE killed %15, %subreg.sub0, killed %16, %subreg.sub1
%18 = S_MOV_B32 0
%19 = S_MOV_B32 1048576
- %20 = REG_SEQUENCE killed %19, 1, killed %18, 2
+ %20 = REG_SEQUENCE killed %19, %subreg.sub0, killed %18, %subreg.sub1
%22 = COPY killed %20
%21 = V_CMP_LT_U64_e64 killed %17, %22, implicit %exec
%1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec
@@ -133,7 +133,7 @@ body: |
%24 = S_LSHL_B64 %0, killed %23, implicit-def dead %scc
%25 = S_MOV_B32 61440
%26 = S_MOV_B32 0
- %27 = REG_SEQUENCE killed %26, 1, killed %25, 2
+ %27 = REG_SEQUENCE killed %26, %subreg.sub0, killed %25, %subreg.sub1
%28 = REG_SEQUENCE %6, 17, killed %27, 18
%29 = V_MOV_B32_e32 0, implicit %exec
%30 = COPY %24
@@ -208,7 +208,7 @@ body: |
%9 = S_LOAD_DWORDX2_IMM %3, 13, 0
%6 = COPY %7
%10 = S_MOV_B32 0
- %11 = REG_SEQUENCE %2, 1, killed %10, 2
+ %11 = REG_SEQUENCE %2, %subreg.sub0, killed %10, %subreg.sub1
%0 = COPY %11
%12 = COPY %11.sub0
%13 = COPY %11.sub1
@@ -216,15 +216,15 @@ body: |
%15 = COPY %8.sub1
%16 = S_ADD_U32 %12, killed %14, implicit-def %scc
%17 = S_ADDC_U32 %13, killed %15, implicit-def dead %scc, implicit %scc
- %18 = REG_SEQUENCE killed %16, 1, killed %17, 2
+ %18 = REG_SEQUENCE killed %16, %subreg.sub0, killed %17, %subreg.sub1
%19 = COPY %9.sub0
%20 = COPY %9.sub1
%21 = S_ADD_U32 %12, killed %19, implicit-def %scc
%22 = S_ADDC_U32 %13, killed %20, implicit-def dead %scc, implicit %scc
- %23 = REG_SEQUENCE killed %21, 1, killed %22, 2
+ %23 = REG_SEQUENCE killed %21, %subreg.sub0, killed %22, %subreg.sub1
%24 = S_MOV_B32 0
%25 = S_MOV_B32 1048576
- %26 = REG_SEQUENCE killed %25, 1, killed %24, 2
+ %26 = REG_SEQUENCE killed %25, %subreg.sub0, killed %24, %subreg.sub1
%28 = COPY %26
%27 = V_CMP_LT_U64_e64 killed %18, %28, implicit %exec
%29 = V_CMP_LT_U64_e64 killed %23, %28, implicit %exec
@@ -239,7 +239,7 @@ body: |
%33 = S_LSHL_B64 %0, killed %32, implicit-def dead %scc
%34 = S_MOV_B32 61440
%35 = S_MOV_B32 0
- %36 = REG_SEQUENCE killed %35, 1, killed %34, 2
+ %36 = REG_SEQUENCE killed %35, %subreg.sub0, killed %34, %subreg.sub1
%37 = REG_SEQUENCE %6, 17, killed %36, 18
%38 = V_MOV_B32_e32 0, implicit %exec
%39 = COPY %33
@@ -304,7 +304,7 @@ body: |
%8 = S_LOAD_DWORDX2_IMM %3, 11, 0
%6 = COPY %7
%9 = S_MOV_B32 0
- %10 = REG_SEQUENCE %2, 1, killed %9, 2
+ %10 = REG_SEQUENCE %2, %subreg.sub0, killed %9, %subreg.sub1
%0 = COPY %10
%11 = COPY %10.sub0
%12 = COPY %10.sub1
@@ -312,10 +312,10 @@ body: |
%14 = COPY %8.sub1
%15 = S_ADD_U32 killed %11, killed %13, implicit-def %scc
%16 = S_ADDC_U32 killed %12, killed %14, implicit-def dead %scc, implicit %scc
- %17 = REG_SEQUENCE killed %15, 1, killed %16, 2
+ %17 = REG_SEQUENCE killed %15, %subreg.sub0, killed %16, %subreg.sub1
%18 = S_MOV_B32 12
%19 = S_MOV_B32 1048576
- %20 = REG_SEQUENCE killed %19, 1, killed %18, 2
+ %20 = REG_SEQUENCE killed %19, %subreg.sub0, killed %18, %subreg.sub1
%22 = COPY killed %20.sub1
%21 = V_CMP_LT_U32_e64 killed %17.sub0, %22, implicit %exec
%1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec
@@ -328,7 +328,7 @@ body: |
%24 = S_LSHL_B64 %0, killed %23, implicit-def dead %scc
%25 = S_MOV_B32 61440
%26 = S_MOV_B32 0
- %27 = REG_SEQUENCE killed %26, 1, killed %25, 2
+ %27 = REG_SEQUENCE killed %26, %subreg.sub0, killed %25, %subreg.sub1
%28 = REG_SEQUENCE %6, 17, killed %27, 18
%29 = V_MOV_B32_e32 0, implicit %exec
%30 = COPY %24
diff --git a/test/CodeGen/AMDGPU/private-memory-r600.ll b/test/CodeGen/AMDGPU/private-memory-r600.ll
index 866cd16ec3b..65e72817429 100644
--- a/test/CodeGen/AMDGPU/private-memory-r600.ll
+++ b/test/CodeGen/AMDGPU/private-memory-r600.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
-; RUN: opt -S -mtriple=r600-unknown-unknown -mcpu=redwood -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
+; RUN: opt -S -mtriple=r600-unknown-unknown-amdgiz -mcpu=redwood -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+target datalayout = "A5"
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -18,19 +19,19 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
- %stack = alloca [5 x i32], align 4
+ %stack = alloca [5 x i32], align 4, addrspace(5)
%0 = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
- store i32 4, i32* %arrayidx1, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
+ store i32 4, i32 addrspace(5)* %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
- store i32 5, i32* %arrayidx3, align 4
- %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
- %2 = load i32, i32* %arrayidx10, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
+ store i32 5, i32 addrspace(5)* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
+ %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
store i32 %2, i32 addrspace(1)* %out, align 4
- %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
- %3 = load i32, i32* %arrayidx12
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
+ %3 = load i32, i32 addrspace(5)* %arrayidx12
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
store i32 %3, i32 addrspace(1)* %arrayidx13
ret void
@@ -49,20 +50,20 @@ entry:
define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 {
entry:
- %a = alloca %struct.point
- %b = alloca %struct.point
- %a.x.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 0
- %a.y.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 1
- %b.x.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 0
- %b.y.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 1
- store i32 0, i32* %a.x.ptr
- store i32 1, i32* %a.y.ptr
- store i32 2, i32* %b.x.ptr
- store i32 3, i32* %b.y.ptr
- %a.indirect.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 0
- %b.indirect.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 0
- %a.indirect = load i32, i32* %a.indirect.ptr
- %b.indirect = load i32, i32* %b.indirect.ptr
+ %a = alloca %struct.point, addrspace(5)
+ %b = alloca %struct.point, addrspace(5)
+ %a.x.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 0
+ %a.y.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 1
+ %b.x.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 0
+ %b.y.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 1
+ store i32 0, i32 addrspace(5)* %a.x.ptr
+ store i32 1, i32 addrspace(5)* %a.y.ptr
+ store i32 2, i32 addrspace(5)* %b.x.ptr
+ store i32 3, i32 addrspace(5)* %b.y.ptr
+ %a.indirect.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 0
+ %b.indirect.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 0
+ %a.indirect = load i32, i32 addrspace(5)* %a.indirect.ptr
+ %b.indirect = load i32, i32 addrspace(5)* %b.indirect.ptr
%0 = add i32 %a.indirect, %b.indirect
store i32 %0, i32 addrspace(1)* %out
ret void
@@ -77,32 +78,32 @@ entry:
define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
entry:
- %prv_array_const = alloca [2 x i32]
- %prv_array = alloca [2 x i32]
+ %prv_array_const = alloca [2 x i32], addrspace(5)
+ %prv_array = alloca [2 x i32], addrspace(5)
%a = load i32, i32 addrspace(1)* %in
%b_src_ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
%b = load i32, i32 addrspace(1)* %b_src_ptr
- %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
- store i32 %a, i32* %a_dst_ptr
- %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1
- store i32 %b, i32* %b_dst_ptr
+ %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 0
+ store i32 %a, i32 addrspace(5)* %a_dst_ptr
+ %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 1
+ store i32 %b, i32 addrspace(5)* %b_dst_ptr
br label %for.body
for.body:
%inc = phi i32 [0, %entry], [%count, %for.body]
- %x_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
- %x = load i32, i32* %x_ptr
- %y_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
- %y = load i32, i32* %y_ptr
+ %x_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 0
+ %x = load i32, i32 addrspace(5)* %x_ptr
+ %y_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array, i32 0, i32 0
+ %y = load i32, i32 addrspace(5)* %y_ptr
%xy = add i32 %x, %y
- store i32 %xy, i32* %y_ptr
+ store i32 %xy, i32 addrspace(5)* %y_ptr
%count = add i32 %inc, 1
%done = icmp eq i32 %count, 4095
br i1 %done, label %for.end, label %for.body
for.end:
- %value_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
- %value = load i32, i32* %value_ptr
+ %value_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array, i32 0, i32 0
+ %value = load i32, i32 addrspace(5)* %value_ptr
store i32 %value, i32 addrspace(1)* %out
ret void
}
@@ -112,13 +113,13 @@ for.end:
; R600: MOVA_INT
define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
- %0 = alloca [2 x i16]
- %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
- %2 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 1
- store i16 0, i16* %1
- store i16 1, i16* %2
- %3 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 %index
- %4 = load i16, i16* %3
+ %0 = alloca [2 x i16], addrspace(5)
+ %1 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 0
+ %2 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 1
+ store i16 0, i16 addrspace(5)* %1
+ store i16 1, i16 addrspace(5)* %2
+ %3 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 %index
+ %4 = load i16, i16 addrspace(5)* %3
%5 = sext i16 %4 to i32
store i32 %5, i32 addrspace(1)* %out
ret void
@@ -129,13 +130,13 @@ entry:
; R600: MOVA_INT
define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
- %0 = alloca [2 x i8]
- %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
- %2 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 1
- store i8 0, i8* %1
- store i8 1, i8* %2
- %3 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 %index
- %4 = load i8, i8* %3
+ %0 = alloca [2 x i8], addrspace(5)
+ %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 0
+ %2 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 1
+ store i8 0, i8 addrspace(5)* %1
+ store i8 1, i8 addrspace(5)* %2
+ %3 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 %index
+ %4 = load i8, i8 addrspace(5)* %3
%5 = sext i8 %4 to i32
store i32 %5, i32 addrspace(1)* %out
ret void
@@ -150,13 +151,13 @@ entry:
; R600-NOT: MOV * TO.X
define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 {
entry:
- %0 = alloca [2 x i32]
- %1 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 0
- %2 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 1
- store i32 0, i32* %1
- store i32 1, i32* %2
- %3 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 %in
- %4 = load i32, i32* %3
+ %0 = alloca [2 x i32], addrspace(5)
+ %1 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 0
+ %2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 1
+ store i32 0, i32 addrspace(5)* %1
+ store i32 1, i32 addrspace(5)* %2
+ %3 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 %in
+ %4 = load i32, i32 addrspace(5)* %3
%5 = call i32 @llvm.r600.read.tidig.x()
%6 = add i32 %4, %5
store i32 %6, i32 addrspace(1)* %out
@@ -171,22 +172,22 @@ entry:
; R600-NOT: [[CHAN]]+
define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
entry:
- %0 = alloca [3 x i8], align 1
- %1 = alloca [2 x i8], align 1
- %2 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 0
- %3 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 1
- %4 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 2
- %5 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 0
- %6 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 1
- store i8 0, i8* %2
- store i8 1, i8* %3
- store i8 2, i8* %4
- store i8 1, i8* %5
- store i8 0, i8* %6
- %7 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 %in
- %8 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 %in
- %9 = load i8, i8* %7
- %10 = load i8, i8* %8
+ %0 = alloca [3 x i8], align 1, addrspace(5)
+ %1 = alloca [2 x i8], align 1, addrspace(5)
+ %2 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 0
+ %3 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 1
+ %4 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 2
+ %5 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 0
+ %6 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 1
+ store i8 0, i8 addrspace(5)* %2
+ store i8 1, i8 addrspace(5)* %3
+ store i8 2, i8 addrspace(5)* %4
+ store i8 1, i8 addrspace(5)* %5
+ store i8 0, i8 addrspace(5)* %6
+ %7 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 %in
+ %8 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 %in
+ %9 = load i8, i8 addrspace(5)* %7
+ %10 = load i8, i8 addrspace(5)* %8
%11 = add i8 %9, %10
%12 = sext i8 %11 to i32
store i32 %12, i32 addrspace(1)* %out
@@ -195,13 +196,13 @@ entry:
define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
- %alloca = alloca [2 x [2 x i8]]
- %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
- %gep1 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1
- store i8 0, i8* %gep0
- store i8 1, i8* %gep1
- %gep2 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index
- %load = load i8, i8* %gep2
+ %alloca = alloca [2 x [2 x i8]], addrspace(5)
+ %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
+ store i8 0, i8 addrspace(5)* %gep0
+ store i8 1, i8 addrspace(5)* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
+ %load = load i8, i8 addrspace(5)* %gep2
%sext = sext i8 %load to i32
store i32 %sext, i32 addrspace(1)* %out
ret void
@@ -209,26 +210,26 @@ entry:
define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
- %alloca = alloca [2 x [2 x i32]]
- %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
- %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
- store i32 0, i32* %gep0
- store i32 1, i32* %gep1
- %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
- %load = load i32, i32* %gep2
+ %alloca = alloca [2 x [2 x i32]], addrspace(5)
+ %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
+ store i32 0, i32 addrspace(5)* %gep0
+ store i32 1, i32 addrspace(5)* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
+ %load = load i32, i32 addrspace(5)* %gep2
store i32 %load, i32 addrspace(1)* %out
ret void
}
define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
entry:
- %alloca = alloca [2 x [2 x i64]]
- %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
- %gep1 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1
- store i64 0, i64* %gep0
- store i64 1, i64* %gep1
- %gep2 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index
- %load = load i64, i64* %gep2
+ %alloca = alloca [2 x [2 x i64]], addrspace(5)
+ %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
+ store i64 0, i64 addrspace(5)* %gep0
+ store i64 1, i64 addrspace(5)* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
+ %load = load i64, i64 addrspace(5)* %gep2
store i64 %load, i64 addrspace(1)* %out
ret void
}
@@ -237,40 +238,40 @@ entry:
define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
- %alloca = alloca [2 x [2 x %struct.pair32]]
- %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
- %gep1 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1
- store i32 0, i32* %gep0
- store i32 1, i32* %gep1
- %gep2 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0
- %load = load i32, i32* %gep2
+ %alloca = alloca [2 x [2 x %struct.pair32]], addrspace(5)
+ %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0, i32 1
+ %gep1 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1, i32 1
+ store i32 0, i32 addrspace(5)* %gep0
+ store i32 1, i32 addrspace(5)* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index, i32 0
+ %load = load i32, i32 addrspace(5)* %gep2
store i32 %load, i32 addrspace(1)* %out
ret void
}
define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
- %alloca = alloca [2 x %struct.pair32]
- %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
- %gep1 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0
- store i32 0, i32* %gep0
- store i32 1, i32* %gep1
- %gep2 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0
- %load = load i32, i32* %gep2
+ %alloca = alloca [2 x %struct.pair32], addrspace(5)
+ %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 0, i32 1
+ %gep1 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 1, i32 0
+ store i32 0, i32 addrspace(5)* %gep0
+ store i32 1, i32 addrspace(5)* %gep1
+ %gep2 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 %index, i32 0
+ %load = load i32, i32 addrspace(5)* %gep2
store i32 %load, i32 addrspace(1)* %out
ret void
}
define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
entry:
- %tmp = alloca [2 x i32]
- %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
- %tmp2 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
- store i32 0, i32* %tmp1
- store i32 1, i32* %tmp2
+ %tmp = alloca [2 x i32], addrspace(5)
+ %tmp1 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1
+ store i32 0, i32 addrspace(5)* %tmp1
+ store i32 1, i32 addrspace(5)* %tmp2
%cmp = icmp eq i32 %in, 0
- %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2
- %load = load i32, i32* %sel
+ %sel = select i1 %cmp, i32 addrspace(5)* %tmp1, i32 addrspace(5)* %tmp2
+ %load = load i32, i32 addrspace(5)* %sel
store i32 %load, i32 addrspace(1)* %out
ret void
}
@@ -283,14 +284,14 @@ entry:
; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
- %alloca = alloca [16 x i32]
- %tmp0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
- store i32 5, i32* %tmp0
- %tmp1 = ptrtoint [16 x i32]* %alloca to i32
+ %alloca = alloca [16 x i32], addrspace(5)
+ %tmp0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+ store i32 5, i32 addrspace(5)* %tmp0
+ %tmp1 = ptrtoint [16 x i32] addrspace(5)* %alloca to i32
%tmp2 = add i32 %tmp1, 5
- %tmp3 = inttoptr i32 %tmp2 to i32*
- %tmp4 = getelementptr inbounds i32, i32* %tmp3, i32 %b
- %tmp5 = load i32, i32* %tmp4
+ %tmp3 = inttoptr i32 %tmp2 to i32 addrspace(5)*
+ %tmp4 = getelementptr inbounds i32, i32 addrspace(5)* %tmp3, i32 %b
+ %tmp5 = load i32, i32 addrspace(5)* %tmp4
store i32 %tmp5, i32 addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/AMDGPU/simplify-libcalls.ll b/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 47eb9a9a3d1..aa6c1833bde 100644
--- a/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -1,11 +1,11 @@
-; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-POSTLINK %s
-; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-PRELINK %s
-; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-NATIVE %s
+; RUN: opt -S -O1 -mtriple=amdgcn---amdgiz -amdgpu-simplify-libcall <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-POSTLINK %s
+; RUN: opt -S -O1 -mtriple=amdgcn---amdgiz -amdgpu-simplify-libcall -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-PRELINK %s
+; RUN: opt -S -O1 -mtriple=amdgcn---amdgiz -amdgpu-use-native -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-NATIVE %s
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos
; GCN-POSTLINK: tail call fast float @_Z3sinf(
; GCN-POSTLINK: tail call fast float @_Z3cosf(
-; GCN-PRELINK: call fast float @_Z6sincosfPU3AS4f(
+; GCN-PRELINK: call fast float @_Z6sincosfPf(
; GCN-NATIVE: tail call fast float @_Z10native_sinf(
; GCN-NATIVE: tail call fast float @_Z10native_cosf(
define amdgpu_kernel void @test_sincos(float addrspace(1)* nocapture %a) {
@@ -26,7 +26,7 @@ declare float @_Z3cosf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v2
; GCN-POSTLINK: tail call fast <2 x float> @_Z3sinDv2_f(
; GCN-POSTLINK: tail call fast <2 x float> @_Z3cosDv2_f(
-; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPU3AS4S_(
+; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPS_(
; GCN-NATIVE: tail call fast <2 x float> @_Z10native_sinDv2_f(
; GCN-NATIVE: tail call fast <2 x float> @_Z10native_cosDv2_f(
define amdgpu_kernel void @test_sincos_v2(<2 x float> addrspace(1)* nocapture %a) {
@@ -47,7 +47,7 @@ declare <2 x float> @_Z3cosDv2_f(<2 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v3
; GCN-POSTLINK: tail call fast <3 x float> @_Z3sinDv3_f(
; GCN-POSTLINK: tail call fast <3 x float> @_Z3cosDv3_f(
-; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPU3AS4S_(
+; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPS_(
; GCN-NATIVE: tail call fast <3 x float> @_Z10native_sinDv3_f(
; GCN-NATIVE: tail call fast <3 x float> @_Z10native_cosDv3_f(
define amdgpu_kernel void @test_sincos_v3(<3 x float> addrspace(1)* nocapture %a) {
@@ -73,7 +73,7 @@ declare <3 x float> @_Z3cosDv3_f(<3 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v4
; GCN-POSTLINK: tail call fast <4 x float> @_Z3sinDv4_f(
; GCN-POSTLINK: tail call fast <4 x float> @_Z3cosDv4_f(
-; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPU3AS4S_(
+; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPS_(
; GCN-NATIVE: tail call fast <4 x float> @_Z10native_sinDv4_f(
; GCN-NATIVE: tail call fast <4 x float> @_Z10native_cosDv4_f(
define amdgpu_kernel void @test_sincos_v4(<4 x float> addrspace(1)* nocapture %a) {
@@ -94,7 +94,7 @@ declare <4 x float> @_Z3cosDv4_f(<4 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v8
; GCN-POSTLINK: tail call fast <8 x float> @_Z3sinDv8_f(
; GCN-POSTLINK: tail call fast <8 x float> @_Z3cosDv8_f(
-; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPU3AS4S_(
+; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPS_(
; GCN-NATIVE: tail call fast <8 x float> @_Z10native_sinDv8_f(
; GCN-NATIVE: tail call fast <8 x float> @_Z10native_cosDv8_f(
define amdgpu_kernel void @test_sincos_v8(<8 x float> addrspace(1)* nocapture %a) {
@@ -115,7 +115,7 @@ declare <8 x float> @_Z3cosDv8_f(<8 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v16
; GCN-POSTLINK: tail call fast <16 x float> @_Z3sinDv16_f(
; GCN-POSTLINK: tail call fast <16 x float> @_Z3cosDv16_f(
-; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPU3AS4S_(
+; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPS_(
; GCN-NATIVE: tail call fast <16 x float> @_Z10native_sinDv16_f(
; GCN-NATIVE: tail call fast <16 x float> @_Z10native_cosDv16_f(
define amdgpu_kernel void @test_sincos_v16(<16 x float> addrspace(1)* nocapture %a) {
@@ -685,101 +685,101 @@ define amdgpu_kernel void @test_use_native_sincos(float addrspace(1)* %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
- %tmp1 = addrspacecast float addrspace(1)* %arrayidx1 to float addrspace(4)*
- %call = tail call fast float @_Z6sincosfPU3AS4f(float %tmp, float addrspace(4)* %tmp1)
+ %tmp1 = addrspacecast float addrspace(1)* %arrayidx1 to float*
+ %call = tail call fast float @_Z6sincosfPf(float %tmp, float* %tmp1)
store float %call, float addrspace(1)* %a, align 4
ret void
}
-declare float @_Z6sincosfPU3AS4f(float, float addrspace(4)*)
+declare float @_Z6sincosfPf(float, float*)
%opencl.pipe_t = type opaque
%opencl.reserve_id_t = type opaque
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr)
-; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND:[0-9]+]]
-; GCN-PRELINK: call i32 @__read_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 2, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32* %{{.*}}) #[[NOUNWIND:[0-9]+]]
+; GCN-PRELINK: call i32 @__read_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t addrspace(5)* %{{.*}}, i32 2, i32* %{{.*}}) #[[NOUNWIND]]
define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr {
entry:
%tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
- %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8 addrspace(4)*
- %tmp2 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0
- %tmp3 = tail call %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4)
- %tmp4 = tail call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 2, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0
- tail call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 4, i32 4)
+ %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8*
+ %tmp2 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
+ %tmp3 = tail call %opencl.reserve_id_t addrspace(5)* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4)
+ %tmp4 = tail call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
+ tail call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4)
ret void
}
-declare i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)*, i8 addrspace(4)*, i32, i32)
+declare i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)*, i8*, i32, i32)
-declare %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32)
+declare %opencl.reserve_id_t addrspace(5)* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32)
-declare i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i8 addrspace(4)*, i32, i32)
+declare i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i8*, i32, i32)
-declare void @__commit_read_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i32)
+declare void @__commit_read_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i32)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr)
-; GCN-PRELINK: call i32 @__write_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__write_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 2, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__write_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__write_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t addrspace(5)* %{{.*}}, i32 2, i32* %{{.*}}) #[[NOUNWIND]]
define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr {
entry:
%tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
- %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8 addrspace(4)*
- %tmp2 = tail call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0
- %tmp3 = tail call %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0
- %tmp4 = tail call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 2, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0
- tail call void @__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 4, i32 4) #0
+ %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8*
+ %tmp2 = tail call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
+ %tmp3 = tail call %opencl.reserve_id_t addrspace(5)* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0
+ %tmp4 = tail call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
+ tail call void @__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4) #0
ret void
}
-declare i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)*, i8 addrspace(4)*, i32, i32) local_unnamed_addr
+declare i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)*, i8*, i32, i32) local_unnamed_addr
-declare %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32) local_unnamed_addr
+declare %opencl.reserve_id_t addrspace(5)* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32) local_unnamed_addr
-declare i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i8 addrspace(4)*, i32, i32) local_unnamed_addr
+declare i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i8*, i32, i32) local_unnamed_addr
-declare void @__commit_write_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i32) local_unnamed_addr
+declare void @__commit_write_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i32) local_unnamed_addr
%struct.S = type { [100 x i32] }
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pipe_size
-; GCN-PRELINK: call i32 @__read_pipe_2_1(%opencl.pipe_t addrspace(1)* %{{.*}} i8 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_2(%opencl.pipe_t addrspace(1)* %{{.*}} i16 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}} i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_8(%opencl.pipe_t addrspace(1)* %{{.*}} i64 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_16(%opencl.pipe_t addrspace(1)* %{{.*}}, <2 x i64> addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_32(%opencl.pipe_t addrspace(1)* %{{.*}}, <4 x i64> addrspace(4)* %{{.*}} #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_64(%opencl.pipe_t addrspace(1)* %{{.*}}, <8 x i64> addrspace(4)* %{{.*}} #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_128(%opencl.pipe_t addrspace(1)* %{{.*}}, <16 x i64> addrspace(4)* %{{.*}} #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i8 addrspace(4)* %{{.*}} i32 400, i32 4) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_1(%opencl.pipe_t addrspace(1)* %{{.*}} i8* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_2(%opencl.pipe_t addrspace(1)* %{{.*}} i16* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}} i32* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_8(%opencl.pipe_t addrspace(1)* %{{.*}} i64* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_16(%opencl.pipe_t addrspace(1)* %{{.*}}, <2 x i64>* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_32(%opencl.pipe_t addrspace(1)* %{{.*}}, <4 x i64>* %{{.*}} #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_64(%opencl.pipe_t addrspace(1)* %{{.*}}, <8 x i64>* %{{.*}} #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_128(%opencl.pipe_t addrspace(1)* %{{.*}}, <16 x i64>* %{{.*}} #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i8* %{{.*}} i32 400, i32 4) #[[NOUNWIND]]
define amdgpu_kernel void @test_pipe_size(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(1)* %ptr1, %opencl.pipe_t addrspace(1)* %p2, i16 addrspace(1)* %ptr2, %opencl.pipe_t addrspace(1)* %p4, i32 addrspace(1)* %ptr4, %opencl.pipe_t addrspace(1)* %p8, i64 addrspace(1)* %ptr8, %opencl.pipe_t addrspace(1)* %p16, <2 x i64> addrspace(1)* %ptr16, %opencl.pipe_t addrspace(1)* %p32, <4 x i64> addrspace(1)* %ptr32, %opencl.pipe_t addrspace(1)* %p64, <8 x i64> addrspace(1)* %ptr64, %opencl.pipe_t addrspace(1)* %p128, <16 x i64> addrspace(1)* %ptr128, %opencl.pipe_t addrspace(1)* %pu, %struct.S addrspace(1)* %ptru) local_unnamed_addr #0 {
entry:
- %tmp = addrspacecast i8 addrspace(1)* %ptr1 to i8 addrspace(4)*
- %tmp1 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(4)* %tmp, i32 1, i32 1) #0
+ %tmp = addrspacecast i8 addrspace(1)* %ptr1 to i8*
+ %tmp1 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8* %tmp, i32 1, i32 1) #0
%tmp2 = bitcast i16 addrspace(1)* %ptr2 to i8 addrspace(1)*
- %tmp3 = addrspacecast i8 addrspace(1)* %tmp2 to i8 addrspace(4)*
- %tmp4 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8 addrspace(4)* %tmp3, i32 2, i32 2) #0
+ %tmp3 = addrspacecast i8 addrspace(1)* %tmp2 to i8*
+ %tmp4 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8* %tmp3, i32 2, i32 2) #0
%tmp5 = bitcast i32 addrspace(1)* %ptr4 to i8 addrspace(1)*
- %tmp6 = addrspacecast i8 addrspace(1)* %tmp5 to i8 addrspace(4)*
- %tmp7 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8 addrspace(4)* %tmp6, i32 4, i32 4) #0
+ %tmp6 = addrspacecast i8 addrspace(1)* %tmp5 to i8*
+ %tmp7 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8* %tmp6, i32 4, i32 4) #0
%tmp8 = bitcast i64 addrspace(1)* %ptr8 to i8 addrspace(1)*
- %tmp9 = addrspacecast i8 addrspace(1)* %tmp8 to i8 addrspace(4)*
- %tmp10 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8 addrspace(4)* %tmp9, i32 8, i32 8) #0
+ %tmp9 = addrspacecast i8 addrspace(1)* %tmp8 to i8*
+ %tmp10 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8* %tmp9, i32 8, i32 8) #0
%tmp11 = bitcast <2 x i64> addrspace(1)* %ptr16 to i8 addrspace(1)*
- %tmp12 = addrspacecast i8 addrspace(1)* %tmp11 to i8 addrspace(4)*
- %tmp13 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8 addrspace(4)* %tmp12, i32 16, i32 16) #0
+ %tmp12 = addrspacecast i8 addrspace(1)* %tmp11 to i8*
+ %tmp13 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8* %tmp12, i32 16, i32 16) #0
%tmp14 = bitcast <4 x i64> addrspace(1)* %ptr32 to i8 addrspace(1)*
- %tmp15 = addrspacecast i8 addrspace(1)* %tmp14 to i8 addrspace(4)*
- %tmp16 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8 addrspace(4)* %tmp15, i32 32, i32 32) #0
+ %tmp15 = addrspacecast i8 addrspace(1)* %tmp14 to i8*
+ %tmp16 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8* %tmp15, i32 32, i32 32) #0
%tmp17 = bitcast <8 x i64> addrspace(1)* %ptr64 to i8 addrspace(1)*
- %tmp18 = addrspacecast i8 addrspace(1)* %tmp17 to i8 addrspace(4)*
- %tmp19 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p64, i8 addrspace(4)* %tmp18, i32 64, i32 64) #0
+ %tmp18 = addrspacecast i8 addrspace(1)* %tmp17 to i8*
+ %tmp19 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p64, i8* %tmp18, i32 64, i32 64) #0
%tmp20 = bitcast <16 x i64> addrspace(1)* %ptr128 to i8 addrspace(1)*
- %tmp21 = addrspacecast i8 addrspace(1)* %tmp20 to i8 addrspace(4)*
- %tmp22 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8 addrspace(4)* %tmp21, i32 128, i32 128) #0
+ %tmp21 = addrspacecast i8 addrspace(1)* %tmp20 to i8*
+ %tmp22 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8* %tmp21, i32 128, i32 128) #0
%tmp23 = bitcast %struct.S addrspace(1)* %ptru to i8 addrspace(1)*
- %tmp24 = addrspacecast i8 addrspace(1)* %tmp23 to i8 addrspace(4)*
- %tmp25 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8 addrspace(4)* %tmp24, i32 400, i32 4) #0
+ %tmp24 = addrspacecast i8 addrspace(1)* %tmp23 to i8*
+ %tmp25 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8* %tmp24, i32 400, i32 4) #0
ret void
}
diff --git a/test/CodeGen/AMDGPU/unknown-processor.ll b/test/CodeGen/AMDGPU/unknown-processor.ll
index e25f2235993..6dfcff77d81 100644
--- a/test/CodeGen/AMDGPU/unknown-processor.ll
+++ b/test/CodeGen/AMDGPU/unknown-processor.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s
-; RUN: llc -march=r600 -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s
+; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s
+target datalayout = "A5"
; Should not crash when the processor is not recognized and the
; wavefront size feature not set.
@@ -14,7 +15,7 @@
; R600: MOV
define amdgpu_kernel void @foo() {
- %alloca = alloca i32, align 4
- store volatile i32 0, i32* %alloca
+ %alloca = alloca i32, align 4, addrspace(5)
+ store volatile i32 0, i32 addrspace(5)* %alloca
ret void
}
diff --git a/test/CodeGen/AMDGPU/unsupported-calls.ll b/test/CodeGen/AMDGPU/unsupported-calls.ll
index 990b25e0c59..68872c54f7f 100644
--- a/test/CodeGen/AMDGPU/unsupported-calls.ll
+++ b/test/CodeGen/AMDGPU/unsupported-calls.ll
@@ -1,5 +1,5 @@
-; RUN: not llc -march=amdgcn -tailcallopt < %s 2>&1 | FileCheck -check-prefix=GCN %s
-; RUN: not llc -march=r600 -mcpu=cypress -tailcallopt < %s 2>&1 | FileCheck -check-prefix=R600 %s
+; RUN: not llc -march=amdgcn -mtriple=amdgcn---amdgiz -tailcallopt < %s 2>&1 | FileCheck -check-prefix=GCN %s
+; RUN: not llc -march=r600 -mtriple=r600---amdgiz -mcpu=cypress -tailcallopt < %s 2>&1 | FileCheck -check-prefix=R600 %s
declare i32 @external_function(i32) nounwind
diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir
index d96463f00c7..939c851584c 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir
@@ -1,6 +1,7 @@
# RUN: llc -O0 -mtriple arm-- -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
--- |
define void @test_mla() #0 { ret void }
+ define void @test_mla_commutative() #0 { ret void }
define void @test_mla_v5() #1 { ret void }
define void @test_mls() #2 { ret void }
@@ -45,6 +46,40 @@ body: |
; CHECK: BX_RET 14, _, implicit %r0
...
---
+name: test_mla_commutative
+# CHECK-LABEL: name: test_mla_commutative
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: gprb }
+ - { id: 1, class: gprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+ - { id: 4, class: gprb }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ ; CHECK: [[VREGX:%[0-9]+]]:gprnopc = COPY %r0
+ ; CHECK: [[VREGY:%[0-9]+]]:gprnopc = COPY %r1
+ ; CHECK: [[VREGZ:%[0-9]+]]:gprnopc = COPY %r2
+
+ %3(s32) = G_MUL %0, %1
+ %4(s32) = G_ADD %2, %3
+ ; CHECK: [[VREGR:%[0-9]+]]:gprnopc = MLA [[VREGX]], [[VREGY]], [[VREGZ]], 14, _, _
+
+ %r0 = COPY %4(s32)
+ ; CHECK: %r0 = COPY [[VREGR]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
name: test_mla_v5
# CHECK-LABEL: name: test_mla_v5
legalized: true
diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
index 0fdd485ba90..588ceaca2c4 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
@@ -970,9 +970,10 @@ registers:
- { id: 1, class: gprb }
- { id: 2, class: gprb }
- { id: 3, class: gprb }
+ - { id: 4, class: gprb }
body: |
bb.0:
- liveins: %r0, %r1
+ liveins: %r0, %r1, %r2
%0(p0) = COPY %r0
; CHECK: [[VREGX:%[0-9]+]]:gpr = COPY %r0
@@ -980,14 +981,17 @@ body: |
%1(p0) = COPY %r1
; CHECK: [[VREGY:%[0-9]+]]:gpr = COPY %r1
- %2(s1) = G_TRUNC %1(p0)
- ; CHECK: [[VREGC:%[0-9]+]]:gpr = COPY [[VREGY]]
+ %2(s32) = COPY %r2
+ ; CHECK: [[VREGC:%[0-9]+]]:gpr = COPY %r2
- %3(p0) = G_SELECT %2(s1), %0, %1
- ; CHECK: CMPri [[VREGC]], 0, 14, _, implicit-def %cpsr
+ %3(s1) = G_TRUNC %2(s32)
+ ; CHECK: [[VREGD:%[0-9]+]]:gpr = COPY [[VREGC]]
+
+ %4(p0) = G_SELECT %3(s1), %0, %1
+ ; CHECK: CMPri [[VREGD]], 0, 14, _, implicit-def %cpsr
; CHECK: [[RES:%[0-9]+]]:gpr = MOVCCr [[VREGX]], [[VREGY]], 0, %cpsr
- %r0 = COPY %3(p0)
+ %r0 = COPY %4(p0)
; CHECK: %r0 = COPY [[RES]]
BX_RET 14, _, implicit %r0
diff --git a/test/CodeGen/Generic/llc-start-stop.ll b/test/CodeGen/Generic/llc-start-stop.ll
index 85b69c37aa0..9056e2cab49 100644
--- a/test/CodeGen/Generic/llc-start-stop.ll
+++ b/test/CodeGen/Generic/llc-start-stop.ll
@@ -13,15 +13,15 @@
; STOP-BEFORE-NOT: Loop Strength Reduction
; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-AFTER
-; START-AFTER: -machine-branch-prob -gc-lowering
+; START-AFTER: -machine-branch-prob -expandmemcmp
; START-AFTER: FunctionPass Manager
-; START-AFTER-NEXT: Lower Garbage Collection Instructions
+; START-AFTER-NEXT: Expand memcmp() to load/stores
; RUN: llc < %s -debug-pass=Structure -start-before=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-BEFORE
; START-BEFORE: -machine-branch-prob -domtree
; START-BEFORE: FunctionPass Manager
; START-BEFORE: Loop Strength Reduction
-; START-BEFORE-NEXT: Lower Garbage Collection Instructions
+; START-BEFORE-NEXT: Expand memcmp() to load/stores
; RUN: not llc < %s -start-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-START-BEFORE
; RUN: not llc < %s -stop-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-STOP-BEFORE
diff --git a/test/CodeGen/Hexagon/isel-prefer.ll b/test/CodeGen/Hexagon/isel-prefer.ll
index 062b0b3a0ea..7094544f54b 100644
--- a/test/CodeGen/Hexagon/isel-prefer.ll
+++ b/test/CodeGen/Hexagon/isel-prefer.ll
@@ -54,4 +54,14 @@ b2:
ret i32 %v6
}
+; CHECK-LABEL: Prefer_L2_loadrub_io:
+; CHECK: memub(r0+#65)
+define i64 @Prefer_L2_loadrub_io(i8* %a0) #0 {
+b1:
+ %v2 = getelementptr i8, i8* %a0, i32 65
+ %v3 = load i8, i8* %v2
+ %v4 = zext i8 %v3 to i64
+ ret i64 %v4
+}
+
attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/MIR/X86/subregister-index-operands.mir b/test/CodeGen/MIR/X86/subregister-index-operands.mir
index e3c5b9d17ee..4d8b24608b7 100644
--- a/test/CodeGen/MIR/X86/subregister-index-operands.mir
+++ b/test/CodeGen/MIR/X86/subregister-index-operands.mir
@@ -22,9 +22,9 @@ body: |
liveins: %edi, %eax
; CHECK-LABEL: name: t
; CHECK: liveins: %edi, %eax
- ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:gr32 = INSERT_SUBREG %edi, %al, 1
- ; CHECK: [[EXTRACT_SUBREG:%[0-9]+]]:gr8 = EXTRACT_SUBREG %eax, 2
- ; CHECK: %ax = REG_SEQUENCE [[EXTRACT_SUBREG]], 1, [[EXTRACT_SUBREG]], 2
+ ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:gr32 = INSERT_SUBREG %edi, %al, %subreg.sub_8bit
+ ; CHECK: [[EXTRACT_SUBREG:%[0-9]+]]:gr8 = EXTRACT_SUBREG %eax, %subreg.sub_8bit_hi
+ ; CHECK: %ax = REG_SEQUENCE [[EXTRACT_SUBREG]], %subreg.sub_8bit, [[EXTRACT_SUBREG]], %subreg.sub_8bit_hi
; CHECK: RETQ %ax
%0 = INSERT_SUBREG %edi, %al, %subreg.sub_8bit
%1 = EXTRACT_SUBREG %eax, %subreg.sub_8bit_hi
diff --git a/test/CodeGen/Mips/brind-tailcall.ll b/test/CodeGen/Mips/brind-tailcall.ll
new file mode 100644
index 00000000000..78fb0f15107
--- /dev/null
+++ b/test/CodeGen/Mips/brind-tailcall.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=pic < %s 2>&1 | FileCheck --check-prefix=PIC %s
+; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=static < %s 2>&1 | FileCheck --check-prefix=STATIC %s
+; RUN: llc -march=mips64 -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=pic < %s 2>&1 | FileCheck --check-prefix=PIC64 %s
+; RUN: llc -march=mips64 -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=static < %s 2>&1 | FileCheck --check-prefix=STATIC64 %s
+; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=pic -mattr=+micromips < %s 2>&1 | FileCheck --check-prefix=PIC %s
+; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=static -mattr=+micromips < %s 2>&1 | FileCheck --check-prefix=STATIC-MM %s
+; RUN: llc -march=mips -mcpu=mips32r6 -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=pic -mattr=+micromips < %s 2>&1 | FileCheck --check-prefix=PIC %s
+; RUN: llc -march=mips -mcpu=mips32r6 -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=static -mattr=+micromips < %s 2>&1 | FileCheck --check-prefix=STATIC-MM %s
+; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=pic -mattr=+mips16 < %s 2>&1 | FileCheck --check-prefix=MIPS16 %s
+; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \
+; RUN: -relocation-model=static -mattr=+mips16 < %s 2>&1 | FileCheck --check-prefix=MIPS16 %s
+
+; REQUIRES: asserts
+
+; Test that the correct pseudo instructions are generated for indirect
+; branches and tail calls. Previously, the order of the DAG matcher table
+; determined if the correct instruction was selected for mips16.
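+; The indirectbr in @test1 below exercises indirect branch selection, while the
+; tail call to @a in %bb6 exercises tail call selection.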
+
+declare protected void @a()
+
+define void @test1(i32 %a) {
+entry:
+ %0 = trunc i32 %a to i1
+ %1 = select i1 %0,
+ i8* blockaddress(@test1, %bb),
+ i8* blockaddress(@test1, %bb6)
+ indirectbr i8* %1, [label %bb, label %bb6]
+
+; STATIC: PseudoIndirectBranch
+; STATIC-MM: PseudoIndirectBranch
+; STATIC-NOT: PseudoIndirectBranch64
+; STATIC64: PseudoIndirectBranch64
+; PIC: PseudoIndirectBranch
+; PIC-NOT: PseudoIndirectBranch64
+; PIC64: PseudoIndirectBranch64
+; MIPS16: JrcRx16
+bb:
+ ret void
+
+bb6:
+ tail call void @a()
+
+; STATIC: TAILCALL
+; STATIC-NOT: TAILCALL_MM
+; STATIC-MM: TAILCALL_MM
+; PIC: TAILCALLREG
+; PIC-NOT: TAILCALLREG64
+; PIC64: TAILCALLREG64
+; MIPS16: RetRA16
+ ret void
+}
diff --git a/test/CodeGen/Mips/dins.ll b/test/CodeGen/Mips/dins.ll
index 8a8b377861a..2f7138ca4c5 100644
--- a/test/CodeGen/Mips/dins.ll
+++ b/test/CodeGen/Mips/dins.ll
@@ -1,7 +1,11 @@
-; RUN: llc -O2 -march=mips64 -mcpu=mips64r2 -target-abi=n64 < %s -o - | FileCheck %s -check-prefix=MIPS64R2
-; RUN: llc -O2 -march=mips -mcpu=mips32r2 < %s -o - | FileCheck %s -check-prefix=MIPS32R2
-; RUN: llc -O2 -march=mips -mattr=mips16 < %s -o - | FileCheck %s -check-prefix=MIPS16
-; RUN: llc -O2 -march=mips64 -mcpu=mips64r2 -target-abi=n32 < %s -o - | FileCheck %s -check-prefix=MIPS64R2N32
+; RUN: llc -O2 -verify-machineinstrs -march=mips64 -mcpu=mips64r2 \
+; RUN: -target-abi=n64 < %s -o - | FileCheck %s -check-prefix=MIPS64R2
+; RUN: llc -O2 -verify-machineinstrs -march=mips -mcpu=mips32r2 < %s -o - \
+; RUN: | FileCheck %s -check-prefix=MIPS32R2
+; RUN: llc -O2 -verify-machineinstrs -march=mips -mattr=mips16 < %s -o - \
+; RUN: | FileCheck %s -check-prefix=MIPS16
+; RUN: llc -O2 -verify-machineinstrs -march=mips64 -mcpu=mips64r2 \
+; RUN: -target-abi=n32 < %s -o - | FileCheck %s -check-prefix=MIPS64R2N32
; #include <stdint.h>
; #include <stdio.h>
@@ -60,7 +64,7 @@ entry:
; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 123
; MIPS64R2: dinsm $[[R0:[0-9]+]], $[[R1:[0-9]+]], 27, 37
; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 4
-; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 28, 6
+; MIPS64R2: dinsm $[[R0:[0-9]+]], $[[R1:[0-9]+]], 28, 6
; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 5
; MIPS64R2: dinsu $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50, 14
; MIPS64R2: dsrl $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50
diff --git a/test/CodeGen/Mips/msa/emergency-spill.mir b/test/CodeGen/Mips/msa/emergency-spill.mir
new file mode 100644
index 00000000000..502b60f673e
--- /dev/null
+++ b/test/CodeGen/Mips/msa/emergency-spill.mir
@@ -0,0 +1,221 @@
+# RUN: llc %s -start-after=shrink-wrap -march=mips64 -mcpu=mips64r6 -mattr=+fp64,+msa -o /dev/null
+
+# Test that the estimated size of the stack leads to the creation of an emergency
+# spill slot when MSA is in use. Previously, this test case would fail during
+# register scavenging due to the lack of a spill slot.
+--- |
+ define inreg { i64, i64 } @test(i64 inreg %a.coerce0, i64 inreg %a.coerce1, i64 inreg %b.coerce0, i64 inreg %b.coerce1, i32 signext %c) #0 {
+ entry:
+ %retval = alloca <16 x i8>, align 16
+ %a = alloca <16 x i8>, align 16
+ %b = alloca <16 x i8>, align 16
+ %a.addr = alloca <16 x i8>, align 16
+ %b.addr = alloca <16 x i8>, align 16
+ %c.addr = alloca i32, align 4
+ %g = alloca <16 x i8>*, align 8
+ %d = alloca i8*, align 8
+ %0 = bitcast <16 x i8>* %a to { i64, i64 }*
+ %1 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i32 0, i32 0
+ store i64 %a.coerce0, i64* %1, align 16
+ %2 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i32 0, i32 1
+ store i64 %a.coerce1, i64* %2, align 8
+ %a1 = load <16 x i8>, <16 x i8>* %a, align 16
+ %3 = bitcast <16 x i8>* %b to { i64, i64 }*
+ %4 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %3, i32 0, i32 0
+ store i64 %b.coerce0, i64* %4, align 16
+ %5 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %3, i32 0, i32 1
+ store i64 %b.coerce1, i64* %5, align 8
+ %b2 = load <16 x i8>, <16 x i8>* %b, align 16
+ store <16 x i8> %a1, <16 x i8>* %a.addr, align 16
+ store <16 x i8> %b2, <16 x i8>* %b.addr, align 16
+ store i32 %c, i32* %c.addr, align 4
+ %6 = alloca i8, i64 6400, align 16
+ %7 = bitcast i8* %6 to <16 x i8>*
+ store <16 x i8>* %7, <16 x i8>** %g, align 8
+ %8 = load <16 x i8>*, <16 x i8>** %g, align 8
+ call void @h(<16 x i8>* %b.addr, <16 x i8>* %8)
+ %9 = load <16 x i8>*, <16 x i8>** %g, align 8
+ %10 = bitcast <16 x i8>* %9 to i8*
+ store i8* %10, i8** %d, align 8
+ %11 = load <16 x i8>, <16 x i8>* %a.addr, align 16
+ %12 = load i8*, i8** %d, align 8
+ %arrayidx = getelementptr inbounds i8, i8* %12, i64 0
+ %13 = load i8, i8* %arrayidx, align 1
+ %conv = sext i8 %13 to i32
+ %14 = call <16 x i8> @llvm.mips.fill.b(i32 %conv)
+ %add = add <16 x i8> %11, %14
+ %15 = load i8*, i8** %d, align 8
+ %arrayidx3 = getelementptr inbounds i8, i8* %15, i64 1
+ %16 = load i8, i8* %arrayidx3, align 1
+ %conv4 = sext i8 %16 to i32
+ %17 = call <16 x i8> @llvm.mips.fill.b(i32 %conv4)
+ %add5 = add <16 x i8> %add, %17
+ %18 = load <16 x i8>, <16 x i8>* %b.addr, align 16
+ %add6 = add <16 x i8> %18, %add5
+ store <16 x i8> %add6, <16 x i8>* %b.addr, align 16
+ %19 = load <16 x i8>, <16 x i8>* %b.addr, align 16
+ store <16 x i8> %19, <16 x i8>* %retval, align 16
+ %20 = bitcast <16 x i8>* %retval to { i64, i64 }*
+ %21 = load { i64, i64 }, { i64, i64 }* %20, align 16
+ ret { i64, i64 } %21
+ }
+
+ declare void @h(<16 x i8>*, <16 x i8>*)
+
+ declare <16 x i8> @llvm.mips.fill.b(i32)
+
+ declare void @llvm.stackprotector(i8*, i8**)
+
+...
+---
+name: test
+alignment: 3
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+ - { reg: '%a0_64', virtual-reg: '' }
+ - { reg: '%a1_64', virtual-reg: '' }
+ - { reg: '%a2_64', virtual-reg: '' }
+ - { reg: '%a3_64', virtual-reg: '' }
+ - { reg: '%t0_64', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 16
+ adjustsStack: false
+ hasCalls: true
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+ - { id: 0, name: retval, type: default, offset: 0, size: 16, alignment: 16,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 1, name: a, type: default, offset: 0, size: 16, alignment: 16,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 2, name: b, type: default, offset: 0, size: 16, alignment: 16,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 3, name: a.addr, type: default, offset: 0, size: 16, alignment: 16,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 4, name: b.addr, type: default, offset: 0, size: 16, alignment: 16,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 5, name: c.addr, type: default, offset: 0, size: 4, alignment: 4,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 6, name: g, type: default, offset: 0, size: 8, alignment: 8,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 7, name: d, type: default, offset: 0, size: 8, alignment: 8,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 8, name: '', type: default, offset: 0, size: 6400,
+ alignment: 16, stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+constants:
+body: |
+ bb.0.entry:
+ liveins: %a0_64, %a1_64, %a2_64, %a3_64, %t0_64
+
+ SD killed %a0_64, %stack.1.a, 0 :: (store 8 into %ir.1, align 16)
+ SD killed %a1_64, %stack.1.a, 8 :: (store 8 into %ir.2)
+ %w0 = LD_B %stack.1.a, 0 :: (dereferenceable load 16 from %ir.a)
+ SD killed %a2_64, %stack.2.b, 0 :: (store 8 into %ir.4, align 16)
+ SD killed %a3_64, %stack.2.b, 8 :: (store 8 into %ir.5)
+ %w1 = LD_B %stack.2.b, 0 :: (dereferenceable load 16 from %ir.b)
+ ST_B killed %w0, %stack.3.a.addr, 0 :: (store 16 into %ir.a.addr)
+ ST_B killed %w1, %stack.4.b.addr, 0 :: (store 16 into %ir.b.addr)
+ SW %t0, %stack.5.c.addr, 0, implicit killed %t0_64 :: (store 4 into %ir.c.addr)
+ %at_64 = LEA_ADDiu64 %stack.8, 0
+ SD killed %at_64, %stack.6.g, 0 :: (store 8 into %ir.g)
+ %a1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp
+ %a0_64 = LEA_ADDiu64 %stack.4.b.addr, 0
+ JAL @h, csr_n64, implicit-def dead %ra, implicit %a0_64, implicit %a1_64, implicit-def %sp
+ ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp
+ %at_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %v0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %v1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %a0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %a1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %a2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %a3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t4_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t5_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t6_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t7_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %s0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %s1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %s2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %s3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %s4_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %s5_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %s6_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %s7_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t8_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %t9_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %ra_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+ %w0 = LD_B %stack.3.a.addr, 0 :: (dereferenceable load 16 from %ir.a.addr)
+ SD %at_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %v0_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %v1_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %a0_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %a1_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %a2_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %a3_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t0_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t1_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t2_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t3_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t4_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t5_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t6_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t7_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %s0_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %s1_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %s2_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %s3_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %s4_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %s5_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %s6_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %s7_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t8_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %t9_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ SD %ra_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+ %at_64 = LD %stack.7.d, 0 :: (dereferenceable load 8 from %ir.d)
+ %v0 = LB %at_64, 0 :: (load 1 from %ir.arrayidx)
+ %w1 = FILL_B killed %v0
+ %w0 = ADDV_B killed %w0, killed %w1
+ %at = LB killed %at_64, 1 :: (load 1 from %ir.arrayidx3)
+ %w1 = FILL_B killed %at
+ %w0 = ADDV_B killed %w0, killed %w1
+ %w1 = LD_B %stack.4.b.addr, 0 :: (dereferenceable load 16 from %ir.b.addr)
+ %w0 = ADDV_B killed %w1, killed %w0
+ ST_B killed %w0, %stack.4.b.addr, 0 :: (store 16 into %ir.b.addr)
+ %w0 = LD_B %stack.4.b.addr, 0 :: (dereferenceable load 16 from %ir.b.addr)
+ ST_B killed %w0, %stack.0.retval, 0 :: (store 16 into %ir.retval)
+ %v0_64 = LD %stack.0.retval, 0 :: (dereferenceable load 8 from %ir.20, align 16)
+ %v1_64 = LD %stack.0.retval, 8 :: (dereferenceable load 8 from %ir.20 + 8, align 16)
+ RetRA implicit %v0_64, implicit %v1_64
+
+...
diff --git a/test/CodeGen/Mips/msa/frameindex.ll b/test/CodeGen/Mips/msa/frameindex.ll
index f903381f9ef..9c2228d3bf6 100644
--- a/test/CodeGen/Mips/msa/frameindex.ll
+++ b/test/CodeGen/Mips/msa/frameindex.ll
@@ -18,7 +18,8 @@ define void @loadstore_v16i8_just_under_simm10() nounwind {
; MIPS32-AE: loadstore_v16i8_just_under_simm10:
%1 = alloca <16 x i8>
- %2 = alloca [496 x i8] ; Push the frame right up to 512 bytes
+ %2 = alloca [492 x i8] ; Push the frame--accounting for the emergency spill
+ ; slot--right up to 512 bytes
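+ ; (16-byte vector + 492-byte array + 4 bytes reserved for the spill slot = 512)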
%3 = load volatile <16 x i8>, <16 x i8>* %1
; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 496($sp)
@@ -33,7 +34,8 @@ define void @loadstore_v16i8_just_over_simm10() nounwind {
; MIPS32-AE: loadstore_v16i8_just_over_simm10:
%1 = alloca <16 x i8>
- %2 = alloca [497 x i8] ; Push the frame just over 512 bytes
+ %2 = alloca [497 x i8] ; Push the frame--accounting for the emergency spill
+ ; slot--just over 512 bytes
%3 = load volatile <16 x i8>, <16 x i8>* %1
; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 512
@@ -50,7 +52,8 @@ define void @loadstore_v16i8_just_under_simm16() nounwind {
; MIPS32-AE: loadstore_v16i8_just_under_simm16:
%1 = alloca <16 x i8>
- %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+ %2 = alloca [32752 x i8] ; Push the frame--accounting for the emergency spill
+ ; slot--right up to 32768 bytes
%3 = load volatile <16 x i8>, <16 x i8>* %1
; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -69,7 +72,8 @@ define void @loadstore_v16i8_just_over_simm16() nounwind {
; MIPS32-AE: loadstore_v16i8_just_over_simm16:
%1 = alloca <16 x i8>
- %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+ %2 = alloca [32753 x i8] ; Push the frame--accounting for the emergency spill
+ ; slot--just over 32768 bytes
%3 = load volatile <16 x i8>, <16 x i8>* %1
; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -121,7 +125,8 @@ define void @loadstore_v8i16_just_under_simm10() nounwind {
; MIPS32-AE: loadstore_v8i16_just_under_simm10:
%1 = alloca <8 x i16>
- %2 = alloca [1008 x i8] ; Push the frame right up to 1024 bytes
+ %2 = alloca [1004 x i8] ; Push the frame--accounting for the emergency spill
+ ; slot--right up to 1024 bytes
%3 = load volatile <8 x i16>, <8 x i16>* %1
; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 1008($sp)
@@ -136,7 +141,8 @@ define void @loadstore_v8i16_just_over_simm10() nounwind {
; MIPS32-AE: loadstore_v8i16_just_over_simm10:
%1 = alloca <8 x i16>
- %2 = alloca [1009 x i8] ; Push the frame just over 1024 bytes
+ %2 = alloca [1009 x i8] ; Push the frame--accounting for the emergency spill
+ ; slot--just over 1024 bytes
%3 = load volatile <8 x i16>, <8 x i16>* %1
; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1024
@@ -153,7 +159,8 @@ define void @loadstore_v8i16_just_under_simm16() nounwind {
; MIPS32-AE: loadstore_v8i16_just_under_simm16:
%1 = alloca <8 x i16>
- %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+ %2 = alloca [32752 x i8] ; Push the frame--accounting for the emergency spill
+ ; slot--right up to 32768 bytes
%3 = load volatile <8 x i16>, <8 x i16>* %1
; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -172,7 +179,8 @@ define void @loadstore_v8i16_just_over_simm16() nounwind {
; MIPS32-AE: loadstore_v8i16_just_over_simm16:
%1 = alloca <8 x i16>
- %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+ %2 = alloca [32753 x i8] ; Push the frame--accounting for the emergency spill
+ ; slot--just over 32768 bytes
%3 = load volatile <8 x i16>, <8 x i16>* %1
; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -224,7 +232,8 @@ define void @loadstore_v4i32_just_under_simm10() nounwind {
; MIPS32-AE: loadstore_v4i32_just_under_simm10:
%1 = alloca <4 x i32>
- %2 = alloca [2032 x i8] ; Push the frame right up to 2048 bytes
+ %2 = alloca [2028 x i8] ; Push the frame--accounting for the emergency spill
+ ; slot--right up to 2048 bytes
%3 = load volatile <4 x i32>, <4 x i32>* %1
; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 2032($sp)
@@ -239,7 +248,8 @@ define void @loadstore_v4i32_just_over_simm10() nounwind {
; MIPS32-AE: loadstore_v4i32_just_over_simm10:
%1 = alloca <4 x i32>
- %2 = alloca [2033 x i8] ; Push the frame just over 2048 bytes
+ %2 = alloca [2033 x i8] ; Push the frame--accounting for the emergency spill
+ ; slot--just over 2048 bytes
%3 = load volatile <4 x i32>, <4 x i32>* %1
; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 2048
@@ -256,7 +266,8 @@ define void @loadstore_v4i32_just_under_simm16() nounwind {
; MIPS32-AE: loadstore_v4i32_just_under_simm16:
%1 = alloca <4 x i32>
- %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+ %2 = alloca [32752 x i8] ; Push the frame--accounting for the emergency spill
+ ; slot--right up to 32768 bytes
%3 = load volatile <4 x i32>, <4 x i32>* %1
; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -275,7 +286,8 @@ define void @loadstore_v4i32_just_over_simm16() nounwind {
; MIPS32-AE: loadstore_v4i32_just_over_simm16:
%1 = alloca <4 x i32>
- %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+ %2 = alloca [32753 x i8] ; Push the frame--accounting for the emergency spill
+ ; slot--just over 32768 bytes
%3 = load volatile <4 x i32>, <4 x i32>* %1
; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -327,8 +339,8 @@ define void @loadstore_v2i64_just_under_simm10() nounwind {
; MIPS32-AE: loadstore_v2i64_just_under_simm10:
%1 = alloca <2 x i64>
- %2 = alloca [4080 x i8] ; Push the frame right up to 4096 bytes
-
+ %2 = alloca [4076 x i8] ; Push the frame--accounting for the emergency spill
+ ; slot--right up to 4096 bytes
%3 = load volatile <2 x i64>, <2 x i64>* %1
; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 4080($sp)
store volatile <2 x i64> %3, <2 x i64>* %1
@@ -342,7 +354,8 @@ define void @loadstore_v2i64_just_over_simm10() nounwind {
; MIPS32-AE: loadstore_v2i64_just_over_simm10:
%1 = alloca <2 x i64>
- %2 = alloca [4081 x i8] ; Push the frame just over 4096 bytes
+ %2 = alloca [4081 x i8] ; Push the frame--accounting for the emergency spill
+ ; slot--just over 4096 bytes
%3 = load volatile <2 x i64>, <2 x i64>* %1
; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 4096
@@ -359,7 +372,8 @@ define void @loadstore_v2i64_just_under_simm16() nounwind {
; MIPS32-AE: loadstore_v2i64_just_under_simm16:
%1 = alloca <2 x i64>
- %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+ %2 = alloca [32752 x i8] ; Push the frame--accounting for the emergency spill
+ ; slot--right up to 32768 bytes
%3 = load volatile <2 x i64>, <2 x i64>* %1
; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -378,7 +392,8 @@ define void @loadstore_v2i64_just_over_simm16() nounwind {
; MIPS32-AE: loadstore_v2i64_just_over_simm16:
%1 = alloca <2 x i64>
- %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+ %2 = alloca [32753 x i8] ; Push the frame--accounting for the emergency spill
+ ; slot--just over 32768 bytes
%3 = load volatile <2 x i64>, <2 x i64>* %1
; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
diff --git a/test/CodeGen/Mips/tailcall/tailcall.ll b/test/CodeGen/Mips/tailcall/tailcall.ll
index 3f04e1cf305..1c81335937d 100644
--- a/test/CodeGen/Mips/tailcall/tailcall.ll
+++ b/test/CodeGen/Mips/tailcall/tailcall.ll
@@ -27,7 +27,7 @@
; RUN: llc -march=mipsel -relocation-model=pic -mcpu=mips32r6 -mattr=+micromips \
; RUN: -mips-tail-calls=1 < %s | FileCheck %s -check-prefixes=ALL,PIC32MM
; RUN: llc -march=mipsel -relocation-model=static -mcpu=mips32r6 \
-; RUN: -mattr=+micromips -mips-tail-calls=1 < %s | FileCheck %s -check-prefixes=ALL,STATIC32
+; RUN: -mattr=+micromips -mips-tail-calls=1 < %s | FileCheck %s -check-prefixes=ALL,STATIC32MMR6
; RUN: llc -march=mips64el -relocation-model=pic -mcpu=mips64r6 \
; RUN: -mattr=+micromips -mips-tail-calls=1 < %s | FileCheck %s -check-prefix=PIC64R6MM
; RUN: llc -march=mips64el -relocation-model=static -mcpu=mips64r6 \
@@ -51,6 +51,7 @@ entry:
; PIC32MM: jalr $25
; PIC32R6: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; N64: jalr $25
; N64R6: jalr $25
; PIC16: jalrc
@@ -68,6 +69,7 @@ entry:
; PIC32MM: jalr $25
; PIC32R6: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; N64: jalr $25
; N64R6: jalr $25
; PIC16: jalrc
@@ -85,6 +87,7 @@ entry:
; PIC32R6: jalr $25
; PIC32MM: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; N64: jalr $25
; N64R6: jalr $25
; PIC16: jalrc
@@ -102,6 +105,7 @@ entry:
; PIC32R6: jalr $25
; PIC32MM: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; PIC64: jalr $25
; STATIC64: jal
; N64R6: jalr $25
@@ -120,6 +124,7 @@ entry:
; PIC32R6: jr $25
; PIC32MM: jr
; STATIC32: j
+; STATIC32MMR6: bc
; PIC64: jr $25
; STATIC64: j
; PIC16: jalrc
@@ -161,6 +166,7 @@ entry:
; PIC32R6: jrc $25
; PIC32MM: jrc
; STATIC32: j
+; STATIC32MMR6: bc
; PIC64: jr $25
; PIC64R6: jrc $25
; PIC64R6MM: jr $25
@@ -178,6 +184,7 @@ entry:
; PIC32R6: jalr $25
; PIC32MM: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; PIC64: jalr $25
; STATIC64: jal
; PIC16: jalrc
@@ -199,6 +206,7 @@ entry:
; PIC32R6: jrc $25
; PIC32MM: jrc
; STATIC32: j
+; STATIC32MMR6: bc
; PIC64: jr $25
; STATIC64: j
; PIC64R6: jrc $25
@@ -214,6 +222,7 @@ entry:
; PIC32R6: jalrc $25
; PIC32MM: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; STATIC64: jal
; PIC64: jalr $25
; PIC64R6: jalrc $25
@@ -232,6 +241,7 @@ entry:
; PIC32R6: jalr $25
; PIC32MM: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; STATIC64: jal
; PIC64: jalr $25
; PIC64R6: jalr $25
@@ -250,6 +260,7 @@ entry:
; PIC32R6: jalrc $25
; PIC32MM: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; STATIC64: jal
; PIC64: jalr $25
; PIC64R6: jalrc $25
@@ -270,6 +281,7 @@ entry:
; PIC32R6: jalrc $25
; PIC32MM: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; STATIC64: jal
; PIC64: jalr $25
; PIC64R6: jalrc $25
@@ -290,6 +302,7 @@ entry:
; PIC32R6: jalr $25
; PIC32MM: jalr $25
; STATIC32: jal
+; STATIC32MMR6: jal
; STATIC64: jal
; PIC64R6: jalr $25
; PIC64: jalr $25
diff --git a/test/CodeGen/NVPTX/atomics-sm60.ll b/test/CodeGen/NVPTX/atomics-sm60.ll
new file mode 100644
index 00000000000..0b5bafb780c
--- /dev/null
+++ b/test/CodeGen/NVPTX/atomics-sm60.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s
+
+; CHECK-LABEL: .func test(
+define void @test(double* %dp0, double addrspace(1)* %dp1, double addrspace(3)* %dp3, double %d) {
+; CHECK: atom.add.f64
+ %r1 = call double @llvm.nvvm.atomic.load.add.f64.p0f64(double* %dp0, double %d)
+; CHECK: atom.global.add.f64
+ %r2 = call double @llvm.nvvm.atomic.load.add.f64.p1f64(double addrspace(1)* %dp1, double %d)
+; CHECK: atom.shared.add.f64
+ %ret = call double @llvm.nvvm.atomic.load.add.f64.p3f64(double addrspace(3)* %dp3, double %d)
+ ret void
+}
+
+declare double @llvm.nvvm.atomic.load.add.f64.p0f64(double* nocapture, double) #1
+declare double @llvm.nvvm.atomic.load.add.f64.p1f64(double addrspace(1)* nocapture, double) #1
+declare double @llvm.nvvm.atomic.load.add.f64.p3f64(double addrspace(3)* nocapture, double) #1
+
+attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll b/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll
index f874148c0e8..5df5183dc2f 100644
--- a/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll
+++ b/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll
@@ -1,6 +1,6 @@
; Verify functionality of NVPTXGenericToNVVM.cpp pass.
;
-; RUN: opt < %s -march nvptx64 -S -generic-to-nvvm -verify-debug-info | FileCheck %s
+; RUN: opt < %s -march nvptx64 -S -generic-to-nvvm | FileCheck %s
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
diff --git a/test/CodeGen/PowerPC/bswap64.ll b/test/CodeGen/PowerPC/bswap64.ll
new file mode 100644
index 00000000000..0a78aa2dc54
--- /dev/null
+++ b/test/CodeGen/PowerPC/bswap64.ll
@@ -0,0 +1,13 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64le-- -mcpu=pwr9 | FileCheck %s
+
+declare i64 @llvm.bswap.i64(i64)
+
+; CHECK: mtvsrdd
+; CHECK: xxbrd
+; CHECK: mfvsrd
+define i64 @bswap64(i64 %x) {
+entry:
+ %0 = call i64 @llvm.bswap.i64(i64 %x)
+ ret i64 %0
+}
+
diff --git a/test/CodeGen/PowerPC/p9-vinsert-vextract.ll b/test/CodeGen/PowerPC/p9-vinsert-vextract.ll
index 31bbc4b1351..c8c7d797c00 100644
--- a/test/CodeGen/PowerPC/p9-vinsert-vextract.ll
+++ b/test/CodeGen/PowerPC/p9-vinsert-vextract.ll
@@ -298,3 +298,825 @@ entry:
ret <8 x i16> %vecins
}
+; The following testcases take one byte element from the second vector and
+; insert it at various locations in the first vector
+define <16 x i8> @shuffle_vector_byte_0_16(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_0_16
+; CHECK: vsldoi 3, 3, 3, 8
+; CHECK: vinsertb 2, 3, 15
+; CHECK-BE-LABEL: shuffle_vector_byte_0_16
+; CHECK-BE: vsldoi 3, 3, 3, 9
+; CHECK-BE: vinsertb 2, 3, 0
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_1_25(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_1_25
+; CHECK: vsldoi 3, 3, 3, 15
+; CHECK: vinsertb 2, 3, 14
+; CHECK-BE-LABEL: shuffle_vector_byte_1_25
+; CHECK-BE: vsldoi 3, 3, 3, 2
+; CHECK-BE: vinsertb 2, 3, 1
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 25, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_2_18(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_2_18
+; CHECK: vsldoi 3, 3, 3, 6
+; CHECK: vinsertb 2, 3, 13
+; CHECK-BE-LABEL: shuffle_vector_byte_2_18
+; CHECK-BE: vsldoi 3, 3, 3, 11
+; CHECK-BE: vinsertb 2, 3, 2
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_3_27(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_3_27
+; CHECK: vsldoi 3, 3, 3, 13
+; CHECK: vinsertb 2, 3, 12
+; CHECK-BE-LABEL: shuffle_vector_byte_3_27
+; CHECK-BE: vsldoi 3, 3, 3, 4
+; CHECK-BE: vinsertb 2, 3, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 27, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_4_20(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_4_20
+; CHECK: vsldoi 3, 3, 3, 4
+; CHECK: vinsertb 2, 3, 11
+; CHECK-BE-LABEL: shuffle_vector_byte_4_20
+; CHECK-BE: vsldoi 3, 3, 3, 13
+; CHECK-BE: vinsertb 2, 3, 4
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_5_29(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_5_29
+; CHECK: vsldoi 3, 3, 3, 11
+; CHECK: vinsertb 2, 3, 10
+; CHECK-BE-LABEL: shuffle_vector_byte_5_29
+; CHECK-BE: vsldoi 3, 3, 3, 6
+; CHECK-BE: vinsertb 2, 3, 5
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 29, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_6_22(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_6_22
+; CHECK: vsldoi 3, 3, 3, 2
+; CHECK: vinsertb 2, 3, 9
+; CHECK-BE-LABEL: shuffle_vector_byte_6_22
+; CHECK-BE: vsldoi 3, 3, 3, 15
+; CHECK-BE: vinsertb 2, 3, 6
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 22, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_7_31(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_7_31
+; CHECK: vsldoi 3, 3, 3, 9
+; CHECK: vinsertb 2, 3, 8
+; CHECK-BE-LABEL: shuffle_vector_byte_7_31
+; CHECK-BE: vsldoi 3, 3, 3, 8
+; CHECK-BE: vinsertb 2, 3, 7
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 31, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_8_24(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_8_24
+; CHECK: vinsertb 2, 3, 7
+; CHECK-BE-LABEL: shuffle_vector_byte_8_24
+; CHECK-BE: vsldoi 3, 3, 3, 1
+; CHECK-BE: vinsertb 2, 3, 8
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_9_17(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_9_17
+; CHECK: vsldoi 3, 3, 3, 7
+; CHECK: vinsertb 2, 3, 6
+; CHECK-BE-LABEL: shuffle_vector_byte_9_17
+; CHECK-BE: vsldoi 3, 3, 3, 10
+; CHECK-BE: vinsertb 2, 3, 9
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_10_26(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_10_26
+; CHECK: vsldoi 3, 3, 3, 14
+; CHECK: vinsertb 2, 3, 5
+; CHECK-BE-LABEL: shuffle_vector_byte_10_26
+; CHECK-BE: vsldoi 3, 3, 3, 3
+; CHECK-BE: vinsertb 2, 3, 10
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_11_19(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_11_19
+; CHECK: vsldoi 3, 3, 3, 5
+; CHECK: vinsertb 2, 3, 4
+; CHECK-BE-LABEL: shuffle_vector_byte_11_19
+; CHECK-BE: vsldoi 3, 3, 3, 12
+; CHECK-BE: vinsertb 2, 3, 11
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 19, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_12_28(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_12_28
+; CHECK: vsldoi 3, 3, 3, 12
+; CHECK: vinsertb 2, 3, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_12_28
+; CHECK-BE: vsldoi 3, 3, 3, 5
+; CHECK-BE: vinsertb 2, 3, 12
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_13_21(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_13_21
+; CHECK: vsldoi 3, 3, 3, 3
+; CHECK: vinsertb 2, 3, 2
+; CHECK-BE-LABEL: shuffle_vector_byte_13_21
+; CHECK-BE: vsldoi 3, 3, 3, 14
+; CHECK-BE: vinsertb 2, 3, 13
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 21, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_14_30(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_14_30
+; CHECK: vsldoi 3, 3, 3, 10
+; CHECK: vinsertb 2, 3, 1
+; CHECK-BE-LABEL: shuffle_vector_byte_14_30
+; CHECK-BE: vsldoi 3, 3, 3, 7
+; CHECK-BE: vinsertb 2, 3, 14
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 30, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_15_23(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_15_23
+; CHECK: vsldoi 3, 3, 3, 1
+; CHECK: vinsertb 2, 3, 0
+; CHECK-BE-LABEL: shuffle_vector_byte_15_23
+; CHECK-BE: vinsertb 2, 3, 15
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 23>
+ ret <16 x i8> %vecins
+}
+
+; The following testcases take one byte element from the first vector and
+; insert it at various locations in the second vector
+define <16 x i8> @shuffle_vector_byte_16_8(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_16_8
+; CHECK: vinsertb 3, 2, 15
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_16_8
+; CHECK-BE: vsldoi 2, 2, 2, 1
+; CHECK-BE: vinsertb 3, 2, 0
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_17_1(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_17_1
+; CHECK: vsldoi 2, 2, 2, 7
+; CHECK: vinsertb 3, 2, 14
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_17_1
+; CHECK-BE: vsldoi 2, 2, 2, 10
+; CHECK-BE: vinsertb 3, 2, 1
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_18_10(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_18_10
+; CHECK: vsldoi 2, 2, 2, 14
+; CHECK: vinsertb 3, 2, 13
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_18_10
+; CHECK-BE: vsldoi 2, 2, 2, 3
+; CHECK-BE: vinsertb 3, 2, 2
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 10, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_19_3(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_19_3
+; CHECK: vsldoi 2, 2, 2, 5
+; CHECK: vinsertb 3, 2, 12
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_19_3
+; CHECK-BE: vsldoi 2, 2, 2, 12
+; CHECK-BE: vinsertb 3, 2, 3
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_20_12(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_20_12
+; CHECK: vsldoi 2, 2, 2, 12
+; CHECK: vinsertb 3, 2, 11
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_20_12
+; CHECK-BE: vsldoi 2, 2, 2, 5
+; CHECK-BE: vinsertb 3, 2, 4
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 12, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_21_5(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_21_5
+; CHECK: vsldoi 2, 2, 2, 3
+; CHECK: vinsertb 3, 2, 10
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_21_5
+; CHECK-BE: vsldoi 2, 2, 2, 14
+; CHECK-BE: vinsertb 3, 2, 5
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 5, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_22_14(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_22_14
+; CHECK: vsldoi 2, 2, 2, 10
+; CHECK: vinsertb 3, 2, 9
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_22_14
+; CHECK-BE: vsldoi 2, 2, 2, 7
+; CHECK-BE: vinsertb 3, 2, 6
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 14, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_23_7(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_23_7
+; CHECK: vsldoi 2, 2, 2, 1
+; CHECK: vinsertb 3, 2, 8
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_23_7
+; CHECK-BE: vinsertb 3, 2, 7
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_24_0(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_24_0
+; CHECK: vsldoi 2, 2, 2, 8
+; CHECK: vinsertb 3, 2, 7
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_24_0
+; CHECK-BE: vsldoi 2, 2, 2, 9
+; CHECK-BE: vinsertb 3, 2, 8
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_25_9(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_25_9
+; CHECK: vsldoi 2, 2, 2, 15
+; CHECK: vinsertb 3, 2, 6
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_25_9
+; CHECK-BE: vsldoi 2, 2, 2, 2
+; CHECK-BE: vinsertb 3, 2, 9
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 9, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_26_2(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_26_2
+; CHECK: vsldoi 2, 2, 2, 6
+; CHECK: vinsertb 3, 2, 5
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_26_2
+; CHECK-BE: vsldoi 2, 2, 2, 11
+; CHECK-BE: vinsertb 3, 2, 10
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 2, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_27_11(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_27_11
+; CHECK: vsldoi 2, 2, 2, 13
+; CHECK: vinsertb 3, 2, 4
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_27_11
+; CHECK-BE: vsldoi 2, 2, 2, 4
+; CHECK-BE: vinsertb 3, 2, 11
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 11, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_28_4(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_28_4
+; CHECK: vsldoi 2, 2, 2, 4
+; CHECK: vinsertb 3, 2, 3
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_28_4
+; CHECK-BE: vsldoi 2, 2, 2, 13
+; CHECK-BE: vinsertb 3, 2, 12
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 4, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_29_13(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_29_13
+; CHECK: vsldoi 2, 2, 2, 11
+; CHECK: vinsertb 3, 2, 2
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_29_13
+; CHECK-BE: vsldoi 2, 2, 2, 6
+; CHECK-BE: vinsertb 3, 2, 13
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 13, i32 30, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_30_6(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_30_6
+; CHECK: vsldoi 2, 2, 2, 2
+; CHECK: vinsertb 3, 2, 1
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_30_6
+; CHECK-BE: vsldoi 2, 2, 2, 15
+; CHECK-BE: vinsertb 3, 2, 14
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 6, i32 31>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_31_15(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_31_15
+; CHECK: vsldoi 2, 2, 2, 9
+; CHECK: vinsertb 3, 2, 0
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_31_15
+; CHECK-BE: vsldoi 2, 2, 2, 8
+; CHECK-BE: vinsertb 3, 2, 15
+; CHECK-BE: vmr 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 15>
+ ret <16 x i8> %vecins
+}
+
+; The following testcases use the same vector in both arguments of the
+; shufflevector. If byte element 7 in BE mode (or 8 in LE mode) is the one
+; we're attempting to insert, then we can use the vector insert instruction.
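+; For example, in @shuffle_vector_byte_0_7 below the mask moves element 7 of %a
+; into lane 0, so only the big-endian run uses vinsertb, while
+; @shuffle_vector_byte_1_8 moves element 8 and only the little-endian run does.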
+define <16 x i8> @shuffle_vector_byte_0_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_0_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_0_7
+; CHECK-BE: vinsertb 2, 2, 0
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_1_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_1_8
+; CHECK: vinsertb 2, 2, 14
+; CHECK-BE-LABEL: shuffle_vector_byte_1_8
+; CHECK-BE-NOT: vinsertb
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 8, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_2_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_2_8
+; CHECK: vinsertb 2, 2, 13
+; CHECK-BE-LABEL: shuffle_vector_byte_2_8
+; CHECK-BE-NOT: vinsertb
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_3_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_3_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_3_7
+; CHECK-BE: vinsertb 2, 2, 3
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_4_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_4_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_4_7
+; CHECK-BE: vinsertb 2, 2, 4
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_5_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_5_8
+; CHECK: vinsertb 2, 2, 10
+; CHECK-BE-LABEL: shuffle_vector_byte_5_8
+; CHECK-BE-NOT: vinsertb
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_6_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_6_8
+; CHECK: vinsertb 2, 2, 9
+; CHECK-BE-LABEL: shuffle_vector_byte_6_8
+; CHECK-BE-NOT: vinsertb
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_7_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_7_8
+; CHECK: vinsertb 2, 2, 8
+; CHECK-BE-LABEL: shuffle_vector_byte_7_8
+; CHECK-BE-NOT: vinsertb
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 8, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_8_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_8_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_8_7
+; CHECK-BE: vinsertb 2, 2, 8
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 7, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_9_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_9_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_9_7
+; CHECK-BE: vinsertb 2, 2, 9
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 7, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_10_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_10_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_10_7
+; CHECK-BE: vinsertb 2, 2, 10
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 7, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_11_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_11_8
+; CHECK: vinsertb 2, 2, 4
+; CHECK-BE-LABEL: shuffle_vector_byte_11_8
+; CHECK-BE-NOT: vinsertb
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 8, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_12_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_12_8
+; CHECK: vinsertb 2, 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_12_8
+; CHECK-BE-NOT: vinsertb
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 8, i32 13, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_13_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_13_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_13_7
+; CHECK-BE: vinsertb 2, 2, 13
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 7, i32 14, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_14_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_14_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_14_7
+; CHECK-BE: vinsertb 2, 2, 14
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 7, i32 15>
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_15_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_15_8
+; CHECK: vinsertb 2, 2, 0
+; CHECK-BE-LABEL: shuffle_vector_byte_15_8
+; CHECK-BE-NOT: vinsertb
+ %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 8>
+ ret <16 x i8> %vecins
+}
+
+; The following tests try to insert one halfword element into the vector. We
+; should always be using the 'vinserth' instruction.
+define <8 x i16> @insert_halfword_0(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_0
+; CHECK: vinserth 2, 3, 14
+; CHECK-BE-LABEL: insert_halfword_0
+; CHECK-BE: vinserth 2, 3, 0
+ %vecins = insertelement <8 x i16> %a, i16 %b, i32 0
+ ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_1(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_1
+; CHECK: vinserth 2, 3, 12
+; CHECK-BE-LABEL: insert_halfword_1
+; CHECK-BE: vinserth 2, 3, 2
+ %vecins = insertelement <8 x i16> %a, i16 %b, i32 1
+ ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_2(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_2
+; CHECK: vinserth 2, 3, 10
+; CHECK-BE-LABEL: insert_halfword_2
+; CHECK-BE: vinserth 2, 3, 4
+ %vecins = insertelement <8 x i16> %a, i16 %b, i32 2
+ ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_3(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_3
+; CHECK: vinserth 2, 3, 8
+; CHECK-BE-LABEL: insert_halfword_3
+; CHECK-BE: vinserth 2, 3, 6
+ %vecins = insertelement <8 x i16> %a, i16 %b, i32 3
+ ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_4(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_4
+; CHECK: vinserth 2, 3, 6
+; CHECK-BE-LABEL: insert_halfword_4
+; CHECK-BE: vinserth 2, 3, 8
+ %vecins = insertelement <8 x i16> %a, i16 %b, i32 4
+ ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_5(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_5
+; CHECK: vinserth 2, 3, 4
+; CHECK-BE-LABEL: insert_halfword_5
+; CHECK-BE: vinserth 2, 3, 10
+ %vecins = insertelement <8 x i16> %a, i16 %b, i32 5
+ ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_6(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_6
+; CHECK: vinserth 2, 3, 2
+; CHECK-BE-LABEL: insert_halfword_6
+; CHECK-BE: vinserth 2, 3, 12
+ %vecins = insertelement <8 x i16> %a, i16 %b, i32 6
+ ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_7(<8 x i16> %a, i16 %b) {
+entry:
+; CHECK-LABEL: insert_halfword_7
+; CHECK: vinserth 2, 3, 0
+; CHECK-BE-LABEL: insert_halfword_7
+; CHECK-BE: vinserth 2, 3, 14
+ %vecins = insertelement <8 x i16> %a, i16 %b, i32 7
+ ret <8 x i16> %vecins
+}
+
+; The following tests try to insert one byte element into the vector. We
+; should always be using the 'vinsertb' instruction.
+define <16 x i8> @insert_byte_0(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_0
+; CHECK: vinsertb 2, 3, 15
+; CHECK-BE-LABEL: insert_byte_0
+; CHECK-BE: vinsertb 2, 3, 0
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 0
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_1(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_1
+; CHECK: vinsertb 2, 3, 14
+; CHECK-BE-LABEL: insert_byte_1
+; CHECK-BE: vinsertb 2, 3, 1
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 1
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_2(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_2
+; CHECK: vinsertb 2, 3, 13
+; CHECK-BE-LABEL: insert_byte_2
+; CHECK-BE: vinsertb 2, 3, 2
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 2
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_3(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_3
+; CHECK: vinsertb 2, 3, 12
+; CHECK-BE-LABEL: insert_byte_3
+; CHECK-BE: vinsertb 2, 3, 3
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 3
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_4(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_4
+; CHECK: vinsertb 2, 3, 11
+; CHECK-BE-LABEL: insert_byte_4
+; CHECK-BE: vinsertb 2, 3, 4
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 4
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_5(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_5
+; CHECK: vinsertb 2, 3, 10
+; CHECK-BE-LABEL: insert_byte_5
+; CHECK-BE: vinsertb 2, 3, 5
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 5
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_6(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_6
+; CHECK: vinsertb 2, 3, 9
+; CHECK-BE-LABEL: insert_byte_6
+; CHECK-BE: vinsertb 2, 3, 6
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 6
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_7(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_7
+; CHECK: vinsertb 2, 3, 8
+; CHECK-BE-LABEL: insert_byte_7
+; CHECK-BE: vinsertb 2, 3, 7
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 7
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_8(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_8
+; CHECK: vinsertb 2, 3, 7
+; CHECK-BE-LABEL: insert_byte_8
+; CHECK-BE: vinsertb 2, 3, 8
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 8
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_9(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_9
+; CHECK: vinsertb 2, 3, 6
+; CHECK-BE-LABEL: insert_byte_9
+; CHECK-BE: vinsertb 2, 3, 9
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 9
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_10(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_10
+; CHECK: vinsertb 2, 3, 5
+; CHECK-BE-LABEL: insert_byte_10
+; CHECK-BE: vinsertb 2, 3, 10
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 10
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_11(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_11
+; CHECK: vinsertb 2, 3, 4
+; CHECK-BE-LABEL: insert_byte_11
+; CHECK-BE: vinsertb 2, 3, 11
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 11
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_12(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_12
+; CHECK: vinsertb 2, 3, 3
+; CHECK-BE-LABEL: insert_byte_12
+; CHECK-BE: vinsertb 2, 3, 12
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 12
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_13(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_13
+; CHECK: vinsertb 2, 3, 2
+; CHECK-BE-LABEL: insert_byte_13
+; CHECK-BE: vinsertb 2, 3, 13
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 13
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_14(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_14
+; CHECK: vinsertb 2, 3, 1
+; CHECK-BE-LABEL: insert_byte_14
+; CHECK-BE: vinsertb 2, 3, 14
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 14
+ ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_15(<16 x i8> %a, i8 %b) {
+entry:
+; CHECK-LABEL: insert_byte_15
+; CHECK: vinsertb 2, 3, 0
+; CHECK-BE-LABEL: insert_byte_15
+; CHECK-BE: vinsertb 2, 3, 15
+ %vecins = insertelement <16 x i8> %a, i8 %b, i32 15
+ ret <16 x i8> %vecins
+}
diff --git a/test/CodeGen/PowerPC/subreg-postra-2.ll b/test/CodeGen/PowerPC/subreg-postra-2.ll
index 338000cd8ba..794c9c190d1 100644
--- a/test/CodeGen/PowerPC/subreg-postra-2.ll
+++ b/test/CodeGen/PowerPC/subreg-postra-2.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gep-opt=0 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false -ppc-gep-opt=0 < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@@ -38,10 +38,10 @@ while.end418: ; preds = %wait_on_buffer.exit
; CHECK: stdcx.
; CHECK: isel {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, [[REG]]
; CHECK-NO-ISEL: bc 12, 20, [[TRUE:.LBB[0-9]+]]
-; CHECK-NO-ISEL: ori 4, 7, 0
+; CHECK-NO-ISEL: ori 7, 8, 0
; CHECK-NO-ISEL-NEXT: b [[SUCCESSOR:.LBB[0-9]+]]
; CHECK-NO-ISEL: [[TRUE]]
-; CHECK-NO-ISEL-NEXT: addi 4, 3, 0
+; CHECK-NO-ISEL: addi 7, 3, 0
if.then420: ; preds = %while.end418
unreachable
diff --git a/test/CodeGen/RISCV/alu32.ll b/test/CodeGen/RISCV/alu32.ll
index 32242d2e40d..9db6bb9dd43 100644
--- a/test/CodeGen/RISCV/alu32.ll
+++ b/test/CodeGen/RISCV/alu32.ll
@@ -7,7 +7,6 @@ define i32 @addi(i32 %a) nounwind {
; RV32I-LABEL: addi:
; RV32I: addi a0, a0, 1
; RV32I: jalr zero, ra, 0
-; TODO: check support for materialising larger constants
%1 = add i32 %a, 1
ret i32 %1
}
diff --git a/test/CodeGen/RISCV/branch.ll b/test/CodeGen/RISCV/branch.ll
new file mode 100644
index 00000000000..194083b07c7
--- /dev/null
+++ b/test/CodeGen/RISCV/branch.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefix=RV32I %s
+
+define void @foo(i32 %a, i32 *%b, i1 %c) {
+; RV32I-LABEL: foo:
+; RV32I: # BB#0:
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: beq a3, a0, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_1
+; RV32I-NEXT: .LBB0_1: # %test2
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: bne a3, a0, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_2
+; RV32I-NEXT: .LBB0_2: # %test3
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: blt a3, a0, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_3
+; RV32I-NEXT: .LBB0_3: # %test4
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: bge a3, a0, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_4
+; RV32I-NEXT: .LBB0_4: # %test5
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: bltu a3, a0, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_5
+; RV32I-NEXT: .LBB0_5: # %test6
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: bgeu a3, a0, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_6
+; RV32I-NEXT: .LBB0_6: # %test7
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: blt a0, a3, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_7
+; RV32I-NEXT: .LBB0_7: # %test8
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: bge a0, a3, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_8
+; RV32I-NEXT: .LBB0_8: # %test9
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: bltu a0, a3, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_9
+; RV32I-NEXT: .LBB0_9: # %test10
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: bgeu a0, a3, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_10
+; RV32I-NEXT: .LBB0_10: # %test11
+; RV32I-NEXT: lw a0, 0(a1)
+; RV32I-NEXT: andi a0, a2, 1
+; RV32I-NEXT: bne a0, zero, .LBB0_12
+; RV32I-NEXT: jal zero, .LBB0_11
+; RV32I-NEXT: .LBB0_11: # %test12
+; RV32I-NEXT: lw a0, 0(a1)
+; RV32I-NEXT: .LBB0_12: # %end
+; RV32I-NEXT: jalr zero, ra, 0
+
+ %val1 = load volatile i32, i32* %b
+ %tst1 = icmp eq i32 %val1, %a
+ br i1 %tst1, label %end, label %test2
+
+test2:
+ %val2 = load volatile i32, i32* %b
+ %tst2 = icmp ne i32 %val2, %a
+ br i1 %tst2, label %end, label %test3
+
+test3:
+ %val3 = load volatile i32, i32* %b
+ %tst3 = icmp slt i32 %val3, %a
+ br i1 %tst3, label %end, label %test4
+
+test4:
+ %val4 = load volatile i32, i32* %b
+ %tst4 = icmp sge i32 %val4, %a
+ br i1 %tst4, label %end, label %test5
+
+test5:
+ %val5 = load volatile i32, i32* %b
+ %tst5 = icmp ult i32 %val5, %a
+ br i1 %tst5, label %end, label %test6
+
+test6:
+ %val6 = load volatile i32, i32* %b
+ %tst6 = icmp uge i32 %val6, %a
+ br i1 %tst6, label %end, label %test7
+
+; Check for condition codes that don't have a matching instruction
+
+test7:
+ %val7 = load volatile i32, i32* %b
+ %tst7 = icmp sgt i32 %val7, %a
+ br i1 %tst7, label %end, label %test8
+
+test8:
+ %val8 = load volatile i32, i32* %b
+ %tst8 = icmp sle i32 %val8, %a
+ br i1 %tst8, label %end, label %test9
+
+test9:
+ %val9 = load volatile i32, i32* %b
+ %tst9 = icmp ugt i32 %val9, %a
+ br i1 %tst9, label %end, label %test10
+
+test10:
+ %val10 = load volatile i32, i32* %b
+ %tst10 = icmp ule i32 %val10, %a
+ br i1 %tst10, label %end, label %test11
+
+; Check the case of a branch where the condition was generated in another
+; function
+
+test11:
+ %val11 = load volatile i32, i32* %b
+ br i1 %c, label %end, label %test12
+
+test12:
+ %val12 = load volatile i32, i32* %b
+ br label %end
+
+end:
+ ret void
+}
diff --git a/test/CodeGen/RISCV/calls.ll b/test/CodeGen/RISCV/calls.ll
new file mode 100644
index 00000000000..8abe5e92a8e
--- /dev/null
+++ b/test/CodeGen/RISCV/calls.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefix=RV32I %s
+
+declare i32 @external_function(i32)
+
+define i32 @test_call_external(i32 %a) nounwind {
+; RV32I-LABEL: test_call_external:
+; RV32I: # BB#0:
+; RV32I-NEXT: sw ra, 12(s0)
+; RV32I-NEXT: lui a1, %hi(external_function)
+; RV32I-NEXT: addi a1, a1, %lo(external_function)
+; RV32I-NEXT: jalr ra, a1, 0
+; RV32I-NEXT: lw ra, 12(s0)
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = call i32 @external_function(i32 %a)
+ ret i32 %1
+}
+
+define i32 @defined_function(i32 %a) nounwind {
+; RV32I-LABEL: defined_function:
+; RV32I: # BB#0:
+; RV32I-NEXT: addi a0, a0, 1
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = add i32 %a, 1
+ ret i32 %1
+}
+
+define i32 @test_call_defined(i32 %a) nounwind {
+; RV32I-LABEL: test_call_defined:
+; RV32I: # BB#0:
+; RV32I-NEXT: sw ra, 12(s0)
+; RV32I-NEXT: lui a1, %hi(defined_function)
+; RV32I-NEXT: addi a1, a1, %lo(defined_function)
+; RV32I-NEXT: jalr ra, a1, 0
+; RV32I-NEXT: lw ra, 12(s0)
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = call i32 @defined_function(i32 %a) nounwind
+ ret i32 %1
+}
+
+define i32 @test_call_indirect(i32 (i32)* %a, i32 %b) nounwind {
+; RV32I-LABEL: test_call_indirect:
+; RV32I: # BB#0:
+; RV32I-NEXT: sw ra, 12(s0)
+; RV32I-NEXT: addi a2, a0, 0
+; RV32I-NEXT: addi a0, a1, 0
+; RV32I-NEXT: jalr ra, a2, 0
+; RV32I-NEXT: lw ra, 12(s0)
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = call i32 %a(i32 %b)
+ ret i32 %1
+}
+
+; Ensure that calls to fastcc functions aren't rejected. Such calls may be
+; introduced when compiling with optimisation.
+
+define fastcc i32 @fastcc_function(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: fastcc_function:
+; RV32I: # BB#0:
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = add i32 %a, %b
+ ret i32 %1
+}
+
+define i32 @test_call_fastcc(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: test_call_fastcc:
+; RV32I: # BB#0:
+; RV32I-NEXT: sw ra, 12(s0)
+; RV32I-NEXT: sw s1, 8(s0)
+; RV32I-NEXT: addi s1, a0, 0
+; RV32I-NEXT: lui a0, %hi(fastcc_function)
+; RV32I-NEXT: addi a2, a0, %lo(fastcc_function)
+; RV32I-NEXT: addi a0, s1, 0
+; RV32I-NEXT: jalr ra, a2, 0
+; RV32I-NEXT: addi a0, s1, 0
+; RV32I-NEXT: lw s1, 8(s0)
+; RV32I-NEXT: lw ra, 12(s0)
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = call fastcc i32 @fastcc_function(i32 %a, i32 %b)
+ ret i32 %a
+}
diff --git a/test/CodeGen/RISCV/imm.ll b/test/CodeGen/RISCV/imm.ll
new file mode 100644
index 00000000000..c52638da02e
--- /dev/null
+++ b/test/CodeGen/RISCV/imm.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s -check-prefix=RV32I
+
+; Materializing constants
+
+define i32 @zero() nounwind {
+; RV32I-LABEL: zero:
+; RV32I: # BB#0:
+; RV32I-NEXT: addi a0, zero, 0
+; RV32I-NEXT: jalr zero, ra, 0
+ ret i32 0
+}
+
+define i32 @pos_small() nounwind {
+; RV32I-LABEL: pos_small:
+; RV32I: # BB#0:
+; RV32I-NEXT: addi a0, zero, 2047
+; RV32I-NEXT: jalr zero, ra, 0
+ ret i32 2047
+}
+
+define i32 @neg_small() nounwind {
+; RV32I-LABEL: neg_small:
+; RV32I: # BB#0:
+; RV32I-NEXT: addi a0, zero, -2048
+; RV32I-NEXT: jalr zero, ra, 0
+ ret i32 -2048
+}
+
+define i32 @pos_i32() nounwind {
+; RV32I-LABEL: pos_i32:
+; RV32I: # BB#0:
+; RV32I-NEXT: lui a0, 423811
+; RV32I-NEXT: addi a0, a0, -1297
+; RV32I-NEXT: jalr zero, ra, 0
+ ret i32 1735928559
+}
+
+define i32 @neg_i32() nounwind {
+; RV32I-LABEL: neg_i32:
+; RV32I: # BB#0:
+; RV32I-NEXT: lui a0, 912092
+; RV32I-NEXT: addi a0, a0, -273
+; RV32I-NEXT: jalr zero, ra, 0
+ ret i32 -559038737
+}
diff --git a/test/CodeGen/RISCV/mem.ll b/test/CodeGen/RISCV/mem.ll
new file mode 100644
index 00000000000..b06382f8742
--- /dev/null
+++ b/test/CodeGen/RISCV/mem.ll
@@ -0,0 +1,202 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s -check-prefix=RV32I
+
+; Check indexed and unindexed, sext, zext and anyext loads
+
+define i32 @lb(i8 *%a) nounwind {
+; RV32I-LABEL: lb:
+; RV32I: # BB#0:
+; RV32I-NEXT: lb a1, 0(a0)
+; RV32I-NEXT: lb a0, 1(a0)
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = getelementptr i8, i8* %a, i32 1
+ %2 = load i8, i8* %1
+ %3 = sext i8 %2 to i32
+ ; the unused load will produce an anyext for selection
+ %4 = load volatile i8, i8* %a
+ ret i32 %3
+}
+
+define i32 @lh(i16 *%a) nounwind {
+; RV32I-LABEL: lh:
+; RV32I: # BB#0:
+; RV32I-NEXT: lh a1, 0(a0)
+; RV32I-NEXT: lh a0, 4(a0)
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = getelementptr i16, i16* %a, i32 2
+ %2 = load i16, i16* %1
+ %3 = sext i16 %2 to i32
+ ; the unused load will produce an anyext for selection
+ %4 = load volatile i16, i16* %a
+ ret i32 %3
+}
+
+define i32 @lw(i32 *%a) nounwind {
+; RV32I-LABEL: lw:
+; RV32I: # BB#0:
+; RV32I-NEXT: lw a1, 0(a0)
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = getelementptr i32, i32* %a, i32 3
+ %2 = load i32, i32* %1
+ %3 = load volatile i32, i32* %a
+ ret i32 %2
+}
+
+define i32 @lbu(i8 *%a) nounwind {
+; RV32I-LABEL: lbu:
+; RV32I: # BB#0:
+; RV32I-NEXT: lbu a1, 0(a0)
+; RV32I-NEXT: lbu a0, 4(a0)
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = getelementptr i8, i8* %a, i32 4
+ %2 = load i8, i8* %1
+ %3 = zext i8 %2 to i32
+ %4 = load volatile i8, i8* %a
+ %5 = zext i8 %4 to i32
+ %6 = add i32 %3, %5
+ ret i32 %6
+}
+
+define i32 @lhu(i16 *%a) nounwind {
+; RV32I-LABEL: lhu:
+; RV32I: # BB#0:
+; RV32I-NEXT: lhu a1, 0(a0)
+; RV32I-NEXT: lhu a0, 10(a0)
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = getelementptr i16, i16* %a, i32 5
+ %2 = load i16, i16* %1
+ %3 = zext i16 %2 to i32
+ %4 = load volatile i16, i16* %a
+ %5 = zext i16 %4 to i32
+ %6 = add i32 %3, %5
+ ret i32 %6
+}
+
+; Check indexed and unindexed stores
+
+define void @sb(i8 *%a, i8 %b) nounwind {
+; RV32I-LABEL: sb:
+; RV32I: # BB#0:
+; RV32I-NEXT: sb a1, 6(a0)
+; RV32I-NEXT: sb a1, 0(a0)
+; RV32I-NEXT: jalr zero, ra, 0
+ store i8 %b, i8* %a
+ %1 = getelementptr i8, i8* %a, i32 6
+ store i8 %b, i8* %1
+ ret void
+}
+
+define void @sh(i16 *%a, i16 %b) nounwind {
+; RV32I-LABEL: sh:
+; RV32I: # BB#0:
+; RV32I-NEXT: sh a1, 14(a0)
+; RV32I-NEXT: sh a1, 0(a0)
+; RV32I-NEXT: jalr zero, ra, 0
+ store i16 %b, i16* %a
+ %1 = getelementptr i16, i16* %a, i32 7
+ store i16 %b, i16* %1
+ ret void
+}
+
+define void @sw(i32 *%a, i32 %b) nounwind {
+; RV32I-LABEL: sw:
+; RV32I: # BB#0:
+; RV32I-NEXT: sw a1, 32(a0)
+; RV32I-NEXT: sw a1, 0(a0)
+; RV32I-NEXT: jalr zero, ra, 0
+ store i32 %b, i32* %a
+ %1 = getelementptr i32, i32* %a, i32 8
+ store i32 %b, i32* %1
+ ret void
+}
+
+; Check load and store to an i1 location
+define i32 @load_sext_zext_anyext_i1(i1 *%a) nounwind {
+; RV32I-LABEL: load_sext_zext_anyext_i1:
+; RV32I: # BB#0:
+; RV32I-NEXT: lb a1, 0(a0)
+; RV32I-NEXT: lbu a1, 1(a0)
+; RV32I-NEXT: lbu a0, 2(a0)
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: jalr zero, ra, 0
+ ; sextload i1
+ %1 = getelementptr i1, i1* %a, i32 1
+ %2 = load i1, i1* %1
+ %3 = sext i1 %2 to i32
+ ; zextload i1
+ %4 = getelementptr i1, i1* %a, i32 2
+ %5 = load i1, i1* %4
+ %6 = zext i1 %5 to i32
+ %7 = add i32 %3, %6
+ ; extload i1 (anyext). Produced as the load is unused.
+ %8 = load volatile i1, i1* %a
+ ret i32 %7
+}
+
+define i16 @load_sext_zext_anyext_i1_i16(i1 *%a) nounwind {
+; RV32I-LABEL: load_sext_zext_anyext_i1_i16:
+; RV32I: # BB#0:
+; RV32I-NEXT: lb a1, 0(a0)
+; RV32I-NEXT: lbu a1, 1(a0)
+; RV32I-NEXT: lbu a0, 2(a0)
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: jalr zero, ra, 0
+ ; sextload i1
+ %1 = getelementptr i1, i1* %a, i32 1
+ %2 = load i1, i1* %1
+ %3 = sext i1 %2 to i16
+ ; zextload i1
+ %4 = getelementptr i1, i1* %a, i32 2
+ %5 = load i1, i1* %4
+ %6 = zext i1 %5 to i16
+ %7 = add i16 %3, %6
+ ; extload i1 (anyext). Produced as the load is unused.
+ %8 = load volatile i1, i1* %a
+ ret i16 %7
+}
+
+; Check load and store to a global
+@G = global i32 0
+
+define i32 @lw_sw_global(i32 %a) nounwind {
+; TODO: the addi should be folded into the lw/sw operations
+; RV32I-LABEL: lw_sw_global:
+; RV32I: # BB#0:
+; RV32I-NEXT: lui a1, %hi(G)
+; RV32I-NEXT: addi a2, a1, %lo(G)
+; RV32I-NEXT: lw a1, 0(a2)
+; RV32I-NEXT: sw a0, 0(a2)
+; RV32I-NEXT: lui a2, %hi(G+36)
+; RV32I-NEXT: addi a2, a2, %lo(G+36)
+; RV32I-NEXT: lw a3, 0(a2)
+; RV32I-NEXT: sw a0, 0(a2)
+; RV32I-NEXT: addi a0, a1, 0
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = load volatile i32, i32* @G
+ store i32 %a, i32* @G
+ %2 = getelementptr i32, i32* @G, i32 9
+ %3 = load volatile i32, i32* %2
+ store i32 %a, i32* %2
+ ret i32 %1
+}
+
+; Ensure that 1 is added to the high 20 bits if bit 11 of the low part is 1
+define i32 @lw_sw_constant(i32 %a) nounwind {
+; TODO: the addi should be folded into the lw/sw
+; RV32I-LABEL: lw_sw_constant:
+; RV32I: # BB#0:
+; RV32I-NEXT: lui a1, 912092
+; RV32I-NEXT: addi a2, a1, -273
+; RV32I-NEXT: lw a1, 0(a2)
+; RV32I-NEXT: sw a0, 0(a2)
+; RV32I-NEXT: addi a0, a1, 0
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = inttoptr i32 3735928559 to i32*
+ %2 = load volatile i32, i32* %1
+ store i32 %a, i32* %1
+ ret i32 %2
+}
diff --git a/test/CodeGen/RISCV/wide-mem.ll b/test/CodeGen/RISCV/wide-mem.ll
new file mode 100644
index 00000000000..18ab52aaf13
--- /dev/null
+++ b/test/CodeGen/RISCV/wide-mem.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s -check-prefix=RV32I
+
+; Check load/store operations on values wider than what is natively supported
+
+define i64 @load_i64(i64 *%a) nounwind {
+; RV32I-LABEL: load_i64:
+; RV32I: # BB#0:
+; RV32I-NEXT: lw a2, 0(a0)
+; RV32I-NEXT: lw a1, 4(a0)
+; RV32I-NEXT: addi a0, a2, 0
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = load i64, i64* %a
+ ret i64 %1
+}
+
+@val64 = local_unnamed_addr global i64 2863311530, align 8
+
+; TODO: codegen on this should be improved. It shouldn't be necessary to
+; generate two addi instructions
+define i64 @load_i64_global() nounwind {
+; RV32I-LABEL: load_i64_global:
+; RV32I: # BB#0:
+; RV32I-NEXT: lui a0, %hi(val64)
+; RV32I-NEXT: addi a0, a0, %lo(val64)
+; RV32I-NEXT: lw a0, 0(a0)
+; RV32I-NEXT: lui a1, %hi(val64+4)
+; RV32I-NEXT: addi a1, a1, %lo(val64+4)
+; RV32I-NEXT: lw a1, 0(a1)
+; RV32I-NEXT: jalr zero, ra, 0
+ %1 = load i64, i64* @val64
+ ret i64 %1
+}
diff --git a/test/CodeGen/WebAssembly/inline-asm-m.ll b/test/CodeGen/WebAssembly/inline-asm-m.ll
new file mode 100644
index 00000000000..8d514a528fd
--- /dev/null
+++ b/test/CodeGen/WebAssembly/inline-asm-m.ll
@@ -0,0 +1,13 @@
+; RUN: not llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -no-integrated-as
+
+; Test basic inline assembly "m" operands, which are unsupported. Pass
+; -no-integrated-as since these aren't actually valid assembly syntax.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown-wasm"
+
+define void @bar(i32* %r, i32* %s) {
+entry:
+ tail call void asm sideeffect "# $0 = bbb($1)", "=*m,*m"(i32* %s, i32* %r) #0, !srcloc !1
+ ret void
+}
diff --git a/test/CodeGen/WebAssembly/inline-asm.ll b/test/CodeGen/WebAssembly/inline-asm.ll
index 56576305d9e..760b0ad0de6 100644
--- a/test/CodeGen/WebAssembly/inline-asm.ll
+++ b/test/CodeGen/WebAssembly/inline-asm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -no-integrated-as | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -no-integrated-as | FileCheck %s
; Test basic inline assembly. Pass -no-integrated-as since these aren't
; actually valid assembly syntax.
@@ -10,33 +10,24 @@ target triple = "wasm32-unknown-unknown-wasm"
; CHECK-NEXT: .param i32{{$}}
; CHECK-NEXT: .result i32{{$}}
; CHECK-NEXT: #APP{{$}}
-; CHECK-NEXT: # $0 = aaa($0){{$}}
+; CHECK-NEXT: # 0 = aaa(0){{$}}
; CHECK-NEXT: #NO_APP{{$}}
-; CHECK-NEXT: return $0{{$}}
+; CHECK-NEXT: get_local $push0=, 0{{$}}
+; CHECK-NEXT: return $pop0{{$}}
define i32 @foo(i32 %r) {
entry:
%0 = tail call i32 asm sideeffect "# $0 = aaa($1)", "=r,r"(i32 %r) #0, !srcloc !0
ret i32 %0
}
-; CHECK-LABEL: bar:
-; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: #APP{{$}}
-; CHECK-NEXT: # 0($1) = bbb(0($0)){{$}}
-; CHECK-NEXT: #NO_APP{{$}}
-; CHECK-NEXT: return{{$}}
-define void @bar(i32* %r, i32* %s) {
-entry:
- tail call void asm sideeffect "# $0 = bbb($1)", "=*m,*m"(i32* %s, i32* %r) #0, !srcloc !1
- ret void
-}
-
; CHECK-LABEL: imm:
; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: .local i32{{$}}
; CHECK-NEXT: #APP{{$}}
-; CHECK-NEXT: # $0 = ccc(42){{$}}
+; CHECK-NEXT: # 0 = ccc(42){{$}}
; CHECK-NEXT: #NO_APP{{$}}
-; CHECK-NEXT: return $0{{$}}
+; CHECK-NEXT: get_local $push0=, 0{{$}}
+; CHECK-NEXT: return $pop0{{$}}
define i32 @imm() {
entry:
%0 = tail call i32 asm sideeffect "# $0 = ccc($1)", "=r,i"(i32 42) #0, !srcloc !2
@@ -47,9 +38,10 @@ entry:
; CHECK-NEXT: .param i64{{$}}
; CHECK-NEXT: .result i64{{$}}
; CHECK-NEXT: #APP{{$}}
-; CHECK-NEXT: # $0 = aaa($0){{$}}
+; CHECK-NEXT: # 0 = aaa(0){{$}}
; CHECK-NEXT: #NO_APP{{$}}
-; CHECK-NEXT: return $0{{$}}
+; CHECK-NEXT: get_local $push0=, 0{{$}}
+; CHECK-NEXT: return $pop0{{$}}
define i64 @foo_i64(i64 %r) {
entry:
%0 = tail call i64 asm sideeffect "# $0 = aaa($1)", "=r,r"(i64 %r) #0, !srcloc !0
@@ -57,16 +49,20 @@ entry:
}
; CHECK-LABEL: X_i16:
-; CHECK: foo $1{{$}}
-; CHECK: i32.store16 0($0), $1{{$}}
+; CHECK: foo 1{{$}}
+; CHECK: get_local $push[[S0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[S1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.store16 0($pop[[S0]]), $pop[[S1]]{{$}}
define void @X_i16(i16 * %t) {
call void asm sideeffect "foo $0", "=*X,~{dirflag},~{fpsr},~{flags},~{memory}"(i16* %t)
ret void
}
; CHECK-LABEL: X_ptr:
-; CHECK: foo $1{{$}}
-; CHECK: i32.store 0($0), $1{{$}}
+; CHECK: foo 1{{$}}
+; CHECK: get_local $push[[S0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[S1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.store 0($pop[[S0]]), $pop[[S1]]{{$}}
define void @X_ptr(i16 ** %t) {
call void asm sideeffect "foo $0", "=*X,~{dirflag},~{fpsr},~{flags},~{memory}"(i16** %t)
ret void
@@ -87,6 +83,20 @@ define void @varname() {
ret void
}
+; CHECK-LABEL: r_constraint
+; CHECK: i32.const $push[[S0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: set_local [[L0:[0-9]+]], $pop[[S0]]{{$}}
+; CHECK-NEXT: i32.const $push[[S1:[0-9]+]]=, 37{{$}}
+; CHECK-NEXT: set_local [[L1:[0-9]+]], $pop[[S1]]{{$}}
+; CHECK: foo [[L2:[0-9]+]], 1, [[L0]], [[L1]]{{$}}
+; CHECK: get_local $push{{[0-9]+}}=, [[L2]]{{$}}
+define hidden i32 @r_constraint(i32 %a, i32 %y) {
+entry:
+ %z = bitcast i32 0 to i32
+ %t0 = tail call i32 asm "foo $0, $1, $2, $3", "=r,r,r,r"(i32 %y, i32 %z, i32 37) #0, !srcloc !0
+ ret i32 %t0
+}
+
attributes #0 = { nounwind }
!0 = !{i32 47}
diff --git a/test/CodeGen/WebAssembly/signext-arg.ll b/test/CodeGen/WebAssembly/signext-arg.ll
new file mode 100644
index 00000000000..cd116c645b4
--- /dev/null
+++ b/test/CodeGen/WebAssembly/signext-arg.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=wasm32 | FileCheck %s
+
+declare i32 @get_int(i16 %arg)
+
+define i32 @func_1(i16 %arg1 , i32 %arg2) #0 {
+; CHECK-LABEL: func_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: i32.const $push1=, 16
+; CHECK-NEXT: i32.shl $push2=, $0, $pop1
+; CHECK-NEXT: i32.const $push4=, 16
+; CHECK-NEXT: i32.shr_s $push3=, $pop2, $pop4
+; CHECK-NEXT: i32.call $push0=, get_int@FUNCTION, $pop3
+; CHECK-NEXT: # fallthrough-return: $pop0
+; CHECK-NEXT: .endfunc
+entry:
+ %retval = call i32 @get_int(i16 signext %arg1)
+ ret i32 %retval
+}
+
+attributes #0 = {noinline nounwind optnone}
+
diff --git a/test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll b/test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll
index 6814ed1d894..109962c2859 100644
--- a/test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll
+++ b/test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll
@@ -23,6 +23,7 @@ lpad: ; preds = %cont, %entry
}
; CHECK: lpad
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: Ltmp
declare i32 @__gxx_personality_v0(...)
diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index 416761ffef4..dd059100503 100644
--- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -88,6 +88,7 @@ define void @full_test() {
; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp)
; X32-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: addl $60, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: full_test:
diff --git a/test/CodeGen/X86/GlobalISel/add-scalar.ll b/test/CodeGen/X86/GlobalISel/add-scalar.ll
index 64a6313023b..9d28f441fb7 100644
--- a/test/CodeGen/X86/GlobalISel/add-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/add-scalar.ll
@@ -20,6 +20,7 @@ define i64 @test_add_i64(i64 %arg1, i64 %arg2) {
; X32-NEXT: addl 8(%ebp), %eax
; X32-NEXT: adcl 12(%ebp), %edx
; X32-NEXT: popl %ebp
+; X32-NEXT: .cfi_def_cfa %esp, 4
; X32-NEXT: retl
%ret = add i64 %arg1, %arg2
ret i64 %ret
diff --git a/test/CodeGen/X86/GlobalISel/brcond.ll b/test/CodeGen/X86/GlobalISel/brcond.ll
index 917ee6f5bd8..2467344776e 100644
--- a/test/CodeGen/X86/GlobalISel/brcond.ll
+++ b/test/CodeGen/X86/GlobalISel/brcond.ll
@@ -36,6 +36,7 @@ define i32 @test_1(i32 %a, i32 %b, i32 %tValue, i32 %fValue) {
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: movl (%esp), %eax
; X32-NEXT: popl %ecx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
%retval = alloca i32, align 4
diff --git a/test/CodeGen/X86/GlobalISel/callingconv.ll b/test/CodeGen/X86/GlobalISel/callingconv.ll
index 4100a7217ac..23987a3c365 100644
--- a/test/CodeGen/X86/GlobalISel/callingconv.ll
+++ b/test/CodeGen/X86/GlobalISel/callingconv.ll
@@ -117,6 +117,7 @@ define <8 x i32> @test_v8i32_args(<8 x i32> %arg1, <8 x i32> %arg2) {
; X32-NEXT: movups 16(%esp), %xmm1
; X32-NEXT: movaps %xmm2, %xmm0
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_v8i32_args:
@@ -135,6 +136,7 @@ define void @test_trivial_call() {
; X32-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: calll trivial_callee
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_trivial_call:
@@ -143,6 +145,7 @@ define void @test_trivial_call() {
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: callq trivial_callee
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
call void @trivial_callee()
ret void
@@ -160,6 +163,7 @@ define void @test_simple_arg_call(i32 %in0, i32 %in1) {
; X32-NEXT: movl %eax, 4(%esp)
; X32-NEXT: calll simple_arg_callee
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_simple_arg_call:
@@ -171,6 +175,7 @@ define void @test_simple_arg_call(i32 %in0, i32 %in1) {
; X64-NEXT: movl %eax, %esi
; X64-NEXT: callq simple_arg_callee
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
call void @simple_arg_callee(i32 %in1, i32 %in0)
ret void
@@ -193,6 +198,7 @@ define void @test_simple_arg8_call(i32 %in0) {
; X32-NEXT: movl %eax, 28(%esp)
; X32-NEXT: calll simple_arg8_callee
; X32-NEXT: addl $44, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_simple_arg8_call:
@@ -208,6 +214,7 @@ define void @test_simple_arg8_call(i32 %in0) {
; X64-NEXT: movl %edi, %r9d
; X64-NEXT: callq simple_arg8_callee
; X64-NEXT: addq $24, %rsp
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
call void @simple_arg8_callee(i32 %in0, i32 %in0, i32 %in0, i32 %in0,i32 %in0, i32 %in0, i32 %in0, i32 %in0)
ret void
@@ -224,6 +231,7 @@ define i32 @test_simple_return_callee() {
; X32-NEXT: calll simple_return_callee
; X32-NEXT: addl %eax, %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_simple_return_callee:
@@ -234,6 +242,7 @@ define i32 @test_simple_return_callee() {
; X64-NEXT: callq simple_return_callee
; X64-NEXT: addl %eax, %eax
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
%call = call i32 @simple_return_callee(i32 5)
%r = add i32 %call, %call
@@ -254,6 +263,7 @@ define <8 x i32> @test_split_return_callee(<8 x i32> %arg1, <8 x i32> %arg2) {
; X32-NEXT: paddd (%esp), %xmm0 # 16-byte Folded Reload
; X32-NEXT: paddd 16(%esp), %xmm1 # 16-byte Folded Reload
; X32-NEXT: addl $44, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_split_return_callee:
@@ -268,6 +278,7 @@ define <8 x i32> @test_split_return_callee(<8 x i32> %arg1, <8 x i32> %arg2) {
; X64-NEXT: paddd (%rsp), %xmm0 # 16-byte Folded Reload
; X64-NEXT: paddd 16(%rsp), %xmm1 # 16-byte Folded Reload
; X64-NEXT: addq $40, %rsp
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
%call = call <8 x i32> @split_return_callee(<8 x i32> %arg2)
%r = add <8 x i32> %arg1, %call
@@ -281,6 +292,7 @@ define void @test_indirect_call(void()* %func) {
; X32-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: calll *16(%esp)
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_indirect_call:
@@ -289,6 +301,7 @@ define void @test_indirect_call(void()* %func) {
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: callq *%rdi
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
call void %func()
ret void
@@ -317,8 +330,11 @@ define void @test_abi_exts_call(i8* %addr) {
; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: calll take_char
; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_abi_exts_call:
@@ -335,6 +351,7 @@ define void @test_abi_exts_call(i8* %addr) {
; X64-NEXT: movl %ebx, %edi
; X64-NEXT: callq take_char
; X64-NEXT: popq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
%val = load i8, i8* %addr
call void @take_char(i8 %val)
@@ -357,6 +374,7 @@ define void @test_variadic_call_1(i8** %addr_ptr, i32* %val_ptr) {
; X32-NEXT: movl %ecx, 4(%esp)
; X32-NEXT: calll variadic_callee
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_variadic_call_1:
@@ -368,6 +386,7 @@ define void @test_variadic_call_1(i8** %addr_ptr, i32* %val_ptr) {
; X64-NEXT: movb $0, %al
; X64-NEXT: callq variadic_callee
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
%addr = load i8*, i8** %addr_ptr
@@ -393,6 +412,7 @@ define void @test_variadic_call_2(i8** %addr_ptr, double* %val_ptr) {
; X32-NEXT: movl %ecx, 4(%eax)
; X32-NEXT: calll variadic_callee
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_variadic_call_2:
@@ -405,6 +425,7 @@ define void @test_variadic_call_2(i8** %addr_ptr, double* %val_ptr) {
; X64-NEXT: movq %rcx, %xmm0
; X64-NEXT: callq variadic_callee
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
%addr = load i8*, i8** %addr_ptr
diff --git a/test/CodeGen/X86/GlobalISel/frameIndex.ll b/test/CodeGen/X86/GlobalISel/frameIndex.ll
index 7b2a050f153..f260d0d707f 100644
--- a/test/CodeGen/X86/GlobalISel/frameIndex.ll
+++ b/test/CodeGen/X86/GlobalISel/frameIndex.ll
@@ -18,6 +18,7 @@ define i32* @allocai32() {
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movl %esp, %eax
; X32-NEXT: popl %ecx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X32ABI-LABEL: allocai32:
diff --git a/test/CodeGen/X86/GlobalISel/select-cmp.mir b/test/CodeGen/X86/GlobalISel/select-cmp.mir
index 9058f010f76..3457e971b8d 100644
--- a/test/CodeGen/X86/GlobalISel/select-cmp.mir
+++ b/test/CodeGen/X86/GlobalISel/select-cmp.mir
@@ -100,7 +100,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr8 = COPY %sil
; CHECK: CMP8rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -131,7 +131,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr16 = COPY %si
; CHECK: CMP16rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -162,7 +162,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr64 = COPY %rsi
; CHECK: CMP64rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -193,7 +193,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -224,7 +224,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETNEr:%[0-9]+]]:gr8 = SETNEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETNEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETNEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -255,7 +255,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETAr:%[0-9]+]]:gr8 = SETAr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -286,7 +286,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETAEr:%[0-9]+]]:gr8 = SETAEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -317,7 +317,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETBr:%[0-9]+]]:gr8 = SETBr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -348,7 +348,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETBEr:%[0-9]+]]:gr8 = SETBEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -379,7 +379,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETGr:%[0-9]+]]:gr8 = SETGr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -410,7 +410,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETGEr:%[0-9]+]]:gr8 = SETGEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -441,7 +441,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETLr:%[0-9]+]]:gr8 = SETLr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -472,7 +472,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETLEr:%[0-9]+]]:gr8 = SETLEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
diff --git a/test/CodeGen/X86/GlobalISel/select-copy.mir b/test/CodeGen/X86/GlobalISel/select-copy.mir
index a72f42782c0..fccba1f8206 100644
--- a/test/CodeGen/X86/GlobalISel/select-copy.mir
+++ b/test/CodeGen/X86/GlobalISel/select-copy.mir
@@ -42,7 +42,7 @@ registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
# ALL: %0:gr8 = COPY %al
-# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, 1
+# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit
# ALL-NEXT: %1:gr32 = AND32ri8 %2, 1, implicit-def %eflags
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
@@ -146,7 +146,7 @@ regBankSelected: true
registers:
- { id: 0, class: gpr, preferred-register: '' }
# ALL: %0:gr8 = COPY %dl
-# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, 1
+# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
body: |
@@ -170,7 +170,7 @@ regBankSelected: true
registers:
- { id: 0, class: gpr, preferred-register: '' }
# ALL: %0:gr16 = COPY %dx
-# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, 3
+# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_16bit
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
body: |
diff --git a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
index 51088e126e5..9df24f65b36 100644
--- a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
+++ b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
@@ -39,7 +39,7 @@ body: |
; ALL-LABEL: name: test_zext_i1
; ALL: [[COPY:%[0-9]+]]:gr8 = COPY %dil
; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]]
- ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 1
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_8bit
; ALL: [[AND64ri8_:%[0-9]+]]:gr64 = AND64ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; ALL: %rax = COPY [[AND64ri8_]]
; ALL: RET 0, implicit %rax
@@ -112,7 +112,7 @@ body: |
; ALL-LABEL: name: anyext_s64_from_s1
; ALL: [[COPY:%[0-9]+]]:gr64_with_sub_8bit = COPY %rdi
; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
- ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 1
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_8bit
; ALL: %rax = COPY [[SUBREG_TO_REG]]
; ALL: RET 0, implicit %rax
%0(s64) = COPY %rdi
@@ -137,7 +137,7 @@ body: |
; ALL-LABEL: name: anyext_s64_from_s8
; ALL: [[COPY:%[0-9]+]]:gr64_with_sub_8bit = COPY %rdi
; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
- ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 1
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_8bit
; ALL: %rax = COPY [[SUBREG_TO_REG]]
; ALL: RET 0, implicit %rax
%0(s64) = COPY %rdi
@@ -162,7 +162,7 @@ body: |
; ALL-LABEL: name: anyext_s64_from_s16
; ALL: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
; ALL: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
- ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 3
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_16bit
; ALL: %rax = COPY [[SUBREG_TO_REG]]
; ALL: RET 0, implicit %rax
%0(s64) = COPY %rdi
@@ -187,7 +187,7 @@ body: |
; ALL-LABEL: name: anyext_s64_from_s32
; ALL: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
; ALL: [[COPY1:%[0-9]+]]:gr32 = COPY [[COPY]].sub_32bit
- ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 4
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_32bit
; ALL: %rax = COPY [[SUBREG_TO_REG]]
; ALL: RET 0, implicit %rax
%0(s64) = COPY %rdi
diff --git a/test/CodeGen/X86/GlobalISel/select-ext.mir b/test/CodeGen/X86/GlobalISel/select-ext.mir
index 5167ee987a5..90ac0c6763a 100644
--- a/test/CodeGen/X86/GlobalISel/select-ext.mir
+++ b/test/CodeGen/X86/GlobalISel/select-ext.mir
@@ -85,7 +85,7 @@ registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
# ALL: %0:gr8 = COPY %dil
-# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %0, 1
+# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit
# ALL-NEXT: %1:gr16 = AND16ri8 %2, 1, implicit-def %eflags
# ALL-NEXT: %ax = COPY %1
# ALL-NEXT: RET 0, implicit %ax
@@ -113,7 +113,7 @@ registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
# ALL: %0:gr8 = COPY %dil
-# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, 1
+# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit
# ALL-NEXT: %1:gr32 = AND32ri8 %2, 1, implicit-def %eflags
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
@@ -288,7 +288,7 @@ registers:
# X32: %0:gr32_abcd = COPY %edi
# X64: %0:gr32 = COPY %edi
# ALL-NEXT: %1:gr8 = COPY %0.sub_8bit
-# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, 1
+# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, %subreg.sub_8bit
# ALL-NEXT: %ax = COPY %2
# ALL-NEXT: RET 0, implicit %ax
body: |
@@ -323,7 +323,7 @@ registers:
# X32: %0:gr32_abcd = COPY %edi
# X64: %0:gr32 = COPY %edi
# ALL-NEXT: %1:gr8 = COPY %0.sub_8bit
-# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, 1
+# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, %subreg.sub_8bit
# ALL-NEXT: %eax = COPY %2
# ALL-NEXT: RET 0, implicit %eax
body: |
@@ -358,7 +358,7 @@ registers:
# X32: %0:gr32_abcd = COPY %edi
# X64: %0:gr32 = COPY %edi
# ALL-NEXT: %1:gr8 = COPY %0.sub_8bit
-# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, 1
+# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, %subreg.sub_8bit
# ALL-NEXT: %ax = COPY %2
# ALL-NEXT: RET 0, implicit %ax
body: |
@@ -422,7 +422,7 @@ registers:
- { id: 2, class: gpr }
# ALL: %0:gr32 = COPY %edi
# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit
-# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, 3
+# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, %subreg.sub_16bit
# ALL-NEXT: %eax = COPY %2
# ALL-NEXT: RET 0, implicit %eax
body: |
diff --git a/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir b/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir
index 596c48b4922..628ab3bac4a 100644
--- a/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir
+++ b/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir
@@ -20,7 +20,7 @@ body: |
bb.0:
; CHECK-LABEL: name: read_flags
; CHECK: [[RDFLAGS32_:%[0-9]+]]:gr32 = RDFLAGS32 implicit-def %esp, implicit %esp
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[RDFLAGS32_]], 4
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[RDFLAGS32_]], %subreg.sub_32bit
; CHECK: %rax = COPY [[SUBREG_TO_REG]]
%0(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.x86.flags.read.u32)
%rax = COPY %0(s32)
diff --git a/test/CodeGen/X86/O0-pipeline.ll b/test/CodeGen/X86/O0-pipeline.ll
index 1f7415ee2af..8ecafad8022 100644
--- a/test/CodeGen/X86/O0-pipeline.ll
+++ b/test/CodeGen/X86/O0-pipeline.ll
@@ -49,6 +49,7 @@
; CHECK-NEXT: X86 pseudo instruction expansion pass
; CHECK-NEXT: Analyze Machine Code For Garbage Collection
; CHECK-NEXT: X86 vzeroupper inserter
+; CHECK-NEXT: Check CFA info and insert CFI instructions if needed
; CHECK-NEXT: Contiguously Lay Out Funclets
; CHECK-NEXT: StackMap Liveness Analysis
; CHECK-NEXT: Live DEBUG_VALUE analysis
diff --git a/test/CodeGen/X86/TruncAssertZext.ll b/test/CodeGen/X86/TruncAssertZext.ll
index b9ae57ca011..ed98fd51cc0 100644
--- a/test/CodeGen/X86/TruncAssertZext.ll
+++ b/test/CodeGen/X86/TruncAssertZext.ll
@@ -25,6 +25,7 @@ define i64 @main() {
; CHECK-NEXT: subq %rcx, %rax
; CHECK-NEXT: shrq $32, %rax
; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%b = call i64 @foo()
%or = and i64 %b, 18446744069414584575 ; this is 0xffffffff000000ff
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index 508f10e9889..14494779f10 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -2209,62 +2209,53 @@ define void @avg_v16i8_const(<16 x i8>* %a) nounwind {
define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v32i8_const:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm5
-; SSE2-NEXT: movdqa 16(%rdi), %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [5,6,7,8]
-; SSE2-NEXT: paddd %xmm9, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,3,4]
-; SSE2-NEXT: paddd %xmm3, %xmm7
-; SSE2-NEXT: paddd %xmm9, %xmm6
-; SSE2-NEXT: paddd %xmm3, %xmm4
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,2,3,4]
+; SSE2-NEXT: paddd %xmm9, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,6,7,8]
+; SSE2-NEXT: paddd %xmm4, %xmm8
; SSE2-NEXT: paddd %xmm9, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm8
+; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm9, %xmm3
+; SSE2-NEXT: paddd %xmm4, %xmm6
; SSE2-NEXT: paddd %xmm9, %xmm1
-; SSE2-NEXT: paddd %xmm3, %xmm0
-; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: paddd %xmm4, %xmm7
+; SSE2-NEXT: psrld $1, %xmm7
; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm8
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: packuswb %xmm7, %xmm1
; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm7
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: packuswb %xmm6, %xmm3
+; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm7
-; SSE2-NEXT: packuswb %xmm5, %xmm7
-; SSE2-NEXT: pand %xmm3, %xmm6
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: packuswb %xmm6, %xmm4
-; SSE2-NEXT: packuswb %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm3, %xmm8
-; SSE2-NEXT: packuswb %xmm2, %xmm8
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: psrld $1, %xmm8
+; SSE2-NEXT: psrld $1, %xmm0
; SSE2-NEXT: packuswb %xmm8, %xmm0
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: movdqu %xmm4, (%rax)
+; SSE2-NEXT: packuswb %xmm0, %xmm2
+; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8_const:
@@ -2277,9 +2268,9 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [5,6,7,8]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,2,3,4]
; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,2,3,4]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8]
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5
; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
@@ -2287,30 +2278,21 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm8
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm9, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm1, %xmm7, %xmm7
-; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm5
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1
-; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2
+; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm4, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm5, %xmm2
+; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2
+; AVX1-NEXT: vpsrld $1, %xmm9, %xmm3
+; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -2567,49 +2549,40 @@ define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6
; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5
; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm9
+; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm10
+; AVX2-NEXT: vpsrld $1, %ymm0, %ymm8
; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm3
-; AVX2-NEXT: vpsrld $1, %ymm9, %ymm8
+; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
+; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
; AVX2-NEXT: vpsrld $1, %ymm6, %ymm6
-; AVX2-NEXT: vpsrld $1, %ymm7, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm7
-; AVX2-NEXT: vpackssdw %xmm7, %xmm2, %xmm7
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm7
-; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm0
-; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; AVX2-NEXT: vpsrld $1, %ymm7, %ymm7
+; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm0
+; AVX2-NEXT: vpackssdw %xmm0, %xmm7, %xmm0
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX2-NEXT: vpackssdw %xmm7, %xmm6, %xmm6
+; AVX2-NEXT: vpackuswb %xmm0, %xmm6, %xmm0
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; AVX2-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm8, %xmm4
-; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm10, %xmm4
-; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm3
+; AVX2-NEXT: vpackssdw %xmm3, %xmm8, %xmm3
+; AVX2-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 923e1b9b0e0..dc386415934 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -12,7 +12,6 @@ define void @zero128() nounwind ssp {
; CHECK-NEXT: movq _z@{{.*}}(%rip), %rax
; CHECK-NEXT: vmovaps %xmm0, (%rax)
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
store <4 x float> zeroinitializer, <4 x float>* @z, align 16
ret void
}
@@ -27,7 +26,6 @@ define void @zero256() nounwind ssp {
; CHECK-NEXT: vmovaps %ymm0, (%rax)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
store <8 x float> zeroinitializer, <8 x float>* @x, align 32
store <4 x double> zeroinitializer, <4 x double>* @y, align 32
ret void
@@ -41,7 +39,6 @@ define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nou
; CHECK-NEXT: vmovaps %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
allocas:
%ptr2vec615 = bitcast [0 x float]* %RET to <8 x float>*
store <8 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float
@@ -59,7 +56,6 @@ define void @ones2([0 x i32]* nocapture %RET, [0 x i32]* nocapture %aFOO) nounwi
; CHECK-NEXT: vmovaps %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
allocas:
%ptr2vec615 = bitcast [0 x i32]* %RET to <8 x i32>*
store <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32>* %ptr2vec615, align 32
@@ -83,7 +79,6 @@ define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind {
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%ptrcast.i33.i = bitcast [0 x float]* %aFOO to i32*
%val.i34.i = load i32, i32* %ptrcast.i33.i, align 4
%ptroffset.i22.i992 = getelementptr [0 x float], [0 x float]* %aFOO, i64 0, i64 1
@@ -102,7 +97,6 @@ define <16 x float> @fneg(<16 x float> %a) nounwind {
; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0
; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
ret <16 x float> %1
}
@@ -114,7 +108,6 @@ define <16 x i16> @build_vec_16x16(i16 %a) nounwind readonly {
; CHECK-NEXT: movzwl %di, %eax
; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%res = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %a, i32 0
ret <16 x i16> %res
}
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index 44eb14160ee..e508e345de6 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -581,15 +581,10 @@ declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone
define <8 x float> @test_x86_avx_rcp_ps_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_rcp_ps_256:
-; AVX: # BB#0:
-; AVX-NEXT: vrcpps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x53,0xc0]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_rcp_ps_256:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vrcp14ps %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x4c,0xc0]
-; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_rcp_ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vrcpps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x53,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -619,15 +614,10 @@ declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readno
define <8 x float> @test_x86_avx_rsqrt_ps_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_rsqrt_ps_256:
-; AVX: # BB#0:
-; AVX-NEXT: vrsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x52,0xc0]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_rsqrt_ps_256:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vrsqrt14ps %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x4e,0xc0]
-; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_rsqrt_ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vrsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x52,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -635,10 +625,15 @@ declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_sqrt_pd_256:
-; CHECK: # BB#0:
-; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX: # BB#0:
+; AVX-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vsqrtpd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x51,0xc0]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -646,10 +641,15 @@ declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_sqrt_ps_256:
-; CHECK: # BB#0:
-; CHECK-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX: # BB#0:
+; AVX-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vsqrtps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x51,0xc0]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll
index 44d13db65c9..858a27b1d48 100644
--- a/test/CodeGen/X86/avx-schedule.ll
+++ b/test/CodeGen/X86/avx-schedule.ll
@@ -3982,8 +3982,8 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
;
; SKX-LABEL: test_rcpps:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
-; SKX-NEXT: vrcp14ps (%rdi), %ymm1 # sched: [11:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpps (%rdi), %ymm1 # sched: [11:1.00]
; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
@@ -4174,8 +4174,8 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
;
; SKX-LABEL: test_rsqrtps:
; SKX: # BB#0:
-; SKX-NEXT: vrsqrt14ps %ymm0, %ymm0 # sched: [4:1.00]
-; SKX-NEXT: vrsqrt14ps (%rdi), %ymm1 # sched: [11:1.00]
+; SKX-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [11:1.00]
; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index b75bd8cc3ee..909e8398680 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -699,11 +699,13 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
; AVX512BW-NEXT: jg LBB17_1
; AVX512BW-NEXT: ## BB#2:
; AVX512BW-NEXT: vpcmpltud %zmm2, %zmm1, %k0
-; AVX512BW-NEXT: jmp LBB17_3
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB17_1:
-; AVX512BW-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
-; AVX512BW-NEXT: LBB17_3:
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-regcall-Mask.ll b/test/CodeGen/X86/avx512-regcall-Mask.ll
index bb541f46567..fa6adec675f 100644
--- a/test/CodeGen/X86/avx512-regcall-Mask.ll
+++ b/test/CodeGen/X86/avx512-regcall-Mask.ll
@@ -209,12 +209,18 @@ define i64 @caller_argv64i1() #0 {
; LINUXOSX64-NEXT: pushq %rax
; LINUXOSX64-NEXT: .cfi_adjust_cfa_offset 8
; LINUXOSX64-NEXT: callq test_argv64i1
-; LINUXOSX64-NEXT: addq $24, %rsp
+; LINUXOSX64-NEXT: addq $16, %rsp
; LINUXOSX64-NEXT: .cfi_adjust_cfa_offset -16
+; LINUXOSX64-NEXT: addq $8, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 40
; LINUXOSX64-NEXT: popq %r12
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 32
; LINUXOSX64-NEXT: popq %r13
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 24
; LINUXOSX64-NEXT: popq %r14
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %r15
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%v0 = bitcast i64 4294967298 to <64 x i1>
@@ -287,6 +293,7 @@ define <64 x i1> @caller_retv64i1() #0 {
; LINUXOSX64-NEXT: kmovq %rax, %k0
; LINUXOSX64-NEXT: vpmovm2b %k0, %zmm0
; LINUXOSX64-NEXT: popq %rax
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%call = call x86_regcallcc <64 x i1> @test_retv64i1()
@@ -397,7 +404,9 @@ define x86_regcallcc i32 @test_argv32i1(<32 x i1> %x0, <32 x i1> %x1, <32 x i1>
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
; LINUXOSX64-NEXT: addq $128, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: vzeroupper
; LINUXOSX64-NEXT: retq
entry:
@@ -451,6 +460,7 @@ define i32 @caller_argv32i1() #0 {
; LINUXOSX64-NEXT: movl $1, %edx
; LINUXOSX64-NEXT: callq test_argv32i1
; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%v0 = bitcast i32 1 to <32 x i1>
@@ -513,6 +523,7 @@ define i32 @caller_retv32i1() #0 {
; LINUXOSX64-NEXT: callq test_retv32i1
; LINUXOSX64-NEXT: incl %eax
; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%call = call x86_regcallcc <32 x i1> @test_retv32i1()
@@ -626,7 +637,9 @@ define x86_regcallcc i16 @test_argv16i1(<16 x i1> %x0, <16 x i1> %x1, <16 x i1>
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
; LINUXOSX64-NEXT: addq $128, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%res = call i16 @test_argv16i1helper(<16 x i1> %x0, <16 x i1> %x1, <16 x i1> %x2)
ret i16 %res
@@ -678,6 +691,7 @@ define i16 @caller_argv16i1() #0 {
; LINUXOSX64-NEXT: movl $1, %edx
; LINUXOSX64-NEXT: callq test_argv16i1
; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%v0 = bitcast i16 1 to <16 x i1>
@@ -746,6 +760,7 @@ define i16 @caller_retv16i1() #0 {
; LINUXOSX64-NEXT: incl %eax
; LINUXOSX64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%call = call x86_regcallcc <16 x i1> @test_retv16i1()
@@ -859,7 +874,9 @@ define x86_regcallcc i8 @test_argv8i1(<8 x i1> %x0, <8 x i1> %x1, <8 x i1> %x2)
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
; LINUXOSX64-NEXT: addq $128, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%res = call i8 @test_argv8i1helper(<8 x i1> %x0, <8 x i1> %x1, <8 x i1> %x2)
ret i8 %res
@@ -911,6 +928,7 @@ define i8 @caller_argv8i1() #0 {
; LINUXOSX64-NEXT: movl $1, %edx
; LINUXOSX64-NEXT: callq test_argv8i1
; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%v0 = bitcast i8 1 to <8 x i1>
@@ -984,9 +1002,11 @@ define <8 x i1> @caller_retv8i1() #0 {
; LINUXOSX64-NEXT: vpmovm2w %k0, %zmm0
; LINUXOSX64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; LINUXOSX64-NEXT: popq %rax
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: vzeroupper
; LINUXOSX64-NEXT: retq
entry:
%call = call x86_regcallcc <8 x i1> @test_retv8i1()
ret <8 x i1> %call
}
+
diff --git a/test/CodeGen/X86/avx512-regcall-NoMask.ll b/test/CodeGen/X86/avx512-regcall-NoMask.ll
index 43a1871245b..b4f1d2c776d 100644
--- a/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -63,6 +63,7 @@ define x86_regcallcc i1 @test_CallargReti1(i1 %a) {
; LINUXOSX64-NEXT: callq test_argReti1
; LINUXOSX64-NEXT: incb %al
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = add i1 %a, 1
%c = call x86_regcallcc i1 @test_argReti1(i1 %b)
@@ -130,6 +131,7 @@ define x86_regcallcc i8 @test_CallargReti8(i8 %a) {
; LINUXOSX64-NEXT: callq test_argReti8
; LINUXOSX64-NEXT: incb %al
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = add i8 %a, 1
%c = call x86_regcallcc i8 @test_argReti8(i8 %b)
@@ -200,6 +202,7 @@ define x86_regcallcc i16 @test_CallargReti16(i16 %a) {
; LINUXOSX64-NEXT: incl %eax
; LINUXOSX64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = add i16 %a, 1
%c = call x86_regcallcc i16 @test_argReti16(i16 %b)
@@ -261,6 +264,7 @@ define x86_regcallcc i32 @test_CallargReti32(i32 %a) {
; LINUXOSX64-NEXT: callq test_argReti32
; LINUXOSX64-NEXT: incl %eax
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = add i32 %a, 1
%c = call x86_regcallcc i32 @test_argReti32(i32 %b)
@@ -327,6 +331,7 @@ define x86_regcallcc i64 @test_CallargReti64(i64 %a) {
; LINUXOSX64-NEXT: callq test_argReti64
; LINUXOSX64-NEXT: incq %rax
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = add i64 %a, 1
%c = call x86_regcallcc i64 @test_argReti64(i64 %b)
@@ -406,7 +411,9 @@ define x86_regcallcc float @test_CallargRetFloat(float %a) {
; LINUXOSX64-NEXT: vaddss %xmm8, %xmm0, %xmm0
; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
; LINUXOSX64-NEXT: addq $16, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = fadd float 1.0, %a
%c = call x86_regcallcc float @test_argRetFloat(float %b)
@@ -486,7 +493,9 @@ define x86_regcallcc double @test_CallargRetDouble(double %a) {
; LINUXOSX64-NEXT: vaddsd %xmm8, %xmm0, %xmm0
; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
; LINUXOSX64-NEXT: addq $16, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = fadd double 1.0, %a
%c = call x86_regcallcc double @test_argRetDouble(double %b)
@@ -548,6 +557,7 @@ define x86_regcallcc x86_fp80 @test_CallargRetf80(x86_fp80 %a) {
; LINUXOSX64-NEXT: callq test_argRetf80
; LINUXOSX64-NEXT: fadd %st(0), %st(0)
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = fadd x86_fp80 %a, %a
%c = call x86_regcallcc x86_fp80 @test_argRetf80(x86_fp80 %b)
@@ -611,6 +621,7 @@ define x86_regcallcc [4 x i32]* @test_CallargRetPointer([4 x i32]* %a) {
; LINUXOSX64-NEXT: callq test_argRetPointer
; LINUXOSX64-NEXT: incl %eax
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = ptrtoint [4 x i32]* %a to i32
%c = add i32 %b, 1
@@ -694,7 +705,9 @@ define x86_regcallcc <4 x i32> @test_CallargRet128Vector(<4 x i32> %a) {
; LINUXOSX64-NEXT: vmovdqa32 %xmm8, %xmm0 {%k1}
; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
; LINUXOSX64-NEXT: addq $16, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = call x86_regcallcc <4 x i32> @test_argRet128Vector(<4 x i32> %a, <4 x i32> %a)
%c = select <4 x i1> undef , <4 x i32> %a, <4 x i32> %b
@@ -768,7 +781,9 @@ define x86_regcallcc <8 x i32> @test_CallargRet256Vector(<8 x i32> %a) {
; LINUXOSX64-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload
; LINUXOSX64-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
; LINUXOSX64-NEXT: addq $48, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = call x86_regcallcc <8 x i32> @test_argRet256Vector(<8 x i32> %a, <8 x i32> %a)
%c = select <8 x i1> undef , <8 x i32> %a, <8 x i32> %b
@@ -842,7 +857,9 @@ define x86_regcallcc <16 x i32> @test_CallargRet512Vector(<16 x i32> %a) {
; LINUXOSX64-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
; LINUXOSX64-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; LINUXOSX64-NEXT: addq $112, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = call x86_regcallcc <16 x i32> @test_argRet512Vector(<16 x i32> %a, <16 x i32> %a)
%c = select <16 x i1> undef , <16 x i32> %a, <16 x i32> %b
diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll
index 8372fbdb9ab..abc8c1a7513 100755
--- a/test/CodeGen/X86/avx512-schedule.ll
+++ b/test/CodeGen/X86/avx512-schedule.ll
@@ -8839,6 +8839,7 @@ define <16 x float> @broadcast_ss_spill(float %x) {
; GENERIC-NEXT: callq func_f32
; GENERIC-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload
; GENERIC-NEXT: addq $24, %rsp # sched: [1:0.33]
+; GENERIC-NEXT: .cfi_def_cfa_offset 8
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: broadcast_ss_spill:
@@ -8852,6 +8853,7 @@ define <16 x float> @broadcast_ss_spill(float %x) {
; SKX-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:0.50]
; SKX-NEXT: # sched: [8:0.50]
; SKX-NEXT: addq $24, %rsp # sched: [1:0.25]
+; SKX-NEXT: .cfi_def_cfa_offset 8
; SKX-NEXT: retq # sched: [7:1.00]
%a = fadd float %x, %x
call void @func_f32(float %a)
@@ -8872,6 +8874,7 @@ define <8 x double> @broadcast_sd_spill(double %x) {
; GENERIC-NEXT: callq func_f64
; GENERIC-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload
; GENERIC-NEXT: addq $24, %rsp # sched: [1:0.33]
+; GENERIC-NEXT: .cfi_def_cfa_offset 8
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: broadcast_sd_spill:
@@ -8885,6 +8888,7 @@ define <8 x double> @broadcast_sd_spill(double %x) {
; SKX-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:0.50]
; SKX-NEXT: # sched: [8:0.50]
; SKX-NEXT: addq $24, %rsp # sched: [1:0.25]
+; SKX-NEXT: .cfi_def_cfa_offset 8
; SKX-NEXT: retq # sched: [7:1.00]
%a = fadd double %x, %x
call void @func_f64(double %a)
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index 43cf9ee7358..51a7c685ed4 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -115,6 +115,7 @@ define <16 x double> @select04(<16 x double> %a, <16 x double> %b) {
; X86-NEXT: vmovaps 8(%ebp), %zmm1
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: retl
;
; X64-LABEL: select04:
diff --git a/test/CodeGen/X86/avx512-shuffle-schedule.ll b/test/CodeGen/X86/avx512-shuffle-schedule.ll
index c59fb5b97bc..c95f0d40fbf 100755
--- a/test/CodeGen/X86/avx512-shuffle-schedule.ll
+++ b/test/CodeGen/X86/avx512-shuffle-schedule.ll
@@ -9533,18 +9533,18 @@ define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %ve
define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9555,18 +9555,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x flo
define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9576,18 +9574,18 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8
define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9598,18 +9596,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x flo
define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9619,18 +9615,18 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8
define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9641,18 +9637,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x flo
define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9675,18 +9669,18 @@ define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %ve
define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9697,18 +9691,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x flo
define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9732,18 +9724,18 @@ define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>*
define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9755,18 +9747,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9778,18 +9768,18 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9801,18 +9791,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9824,18 +9812,18 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9847,18 +9835,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9884,18 +9870,18 @@ define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>*
define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9907,18 +9893,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -10337,18 +10321,18 @@ define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double>
define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10359,18 +10343,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10380,18 +10362,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10402,18 +10384,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10423,18 +10403,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10445,18 +10425,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10479,18 +10457,18 @@ define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double>
define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10501,18 +10479,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10536,18 +10512,18 @@ define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x doub
define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10559,18 +10535,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10582,18 +10556,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec
define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10605,18 +10579,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10628,18 +10600,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec
define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10651,18 +10623,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10688,18 +10658,18 @@ define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x doub
define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10711,18 +10681,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -11128,12 +11096,12 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec
define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
; GENERIC-LABEL: test_8xi32_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
ret <8 x i32> %res
@@ -11141,18 +11109,18 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11163,18 +11131,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11184,18 +11150,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11206,18 +11172,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11227,18 +11191,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11249,18 +11213,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11270,12 +11232,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
; GENERIC-LABEL: test_8xi32_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
ret <8 x i32> %res
@@ -11283,18 +11245,18 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11305,18 +11267,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11326,12 +11286,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; GENERIC-LABEL: test_8xi32_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11340,18 +11300,18 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11363,18 +11323,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11386,18 +11344,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11409,18 +11367,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11432,18 +11388,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11455,18 +11411,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11478,12 +11432,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; GENERIC-LABEL: test_8xi32_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11492,18 +11446,18 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11515,18 +11469,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11932,12 +11884,12 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16
define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
; GENERIC-LABEL: test_4xi64_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
ret <4 x i64> %res
@@ -11945,18 +11897,18 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -11967,18 +11919,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -11988,18 +11938,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12010,18 +11960,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12031,18 +11979,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12053,18 +12001,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12074,12 +12020,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
; GENERIC-LABEL: test_4xi64_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x i64> %res
@@ -12087,18 +12033,18 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12109,18 +12055,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12130,12 +12074,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; GENERIC-LABEL: test_4xi64_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12144,18 +12088,18 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12167,18 +12111,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12190,18 +12132,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12213,18 +12155,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12236,18 +12176,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12259,18 +12199,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12282,12 +12220,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; GENERIC-LABEL: test_4xi64_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12296,18 +12234,18 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12319,18 +12257,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
diff --git a/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll b/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
index c957a85a885..799bbc11bee 100644
--- a/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
+++ b/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
@@ -14,10 +14,10 @@ define <8 x float> @test_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec
define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -28,10 +28,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x floa
define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -41,10 +40,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -55,10 +54,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x floa
define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -68,10 +66,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -82,10 +80,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x floa
define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -103,10 +100,10 @@ define <8 x float> @test_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec
define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -117,10 +114,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x floa
define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -139,10 +135,10 @@ define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>*
define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -154,10 +150,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -169,10 +164,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -184,10 +179,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -199,10 +193,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -214,10 +208,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -238,10 +231,10 @@ define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>*
define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -253,10 +246,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -530,10 +522,10 @@ define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double>
define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -544,10 +536,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -557,10 +548,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -571,10 +562,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -584,10 +574,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -598,10 +588,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -619,10 +608,10 @@ define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double>
define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -633,10 +622,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -655,10 +643,10 @@ define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x doub
define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -670,10 +658,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -685,10 +672,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec
define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -700,10 +687,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -715,10 +701,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec
define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -730,10 +716,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -754,10 +739,10 @@ define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x doub
define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -769,10 +754,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1038,7 +1022,7 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec
define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
; CHECK-LABEL: test_8xi32_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
ret <8 x i32> %res
@@ -1046,10 +1030,10 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1060,10 +1044,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1073,10 +1056,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1087,10 +1070,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1100,10 +1082,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1114,10 +1096,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1127,7 +1108,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
; CHECK-LABEL: test_8xi32_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
ret <8 x i32> %res
@@ -1135,10 +1116,10 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1149,10 +1130,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1162,7 +1142,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; CHECK-LABEL: test_8xi32_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -1171,10 +1151,10 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -1186,10 +1166,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -1201,10 +1180,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1216,10 +1195,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1231,10 +1209,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1246,10 +1224,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1261,7 +1238,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; CHECK-LABEL: test_8xi32_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1270,10 +1247,10 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1285,10 +1262,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1554,7 +1530,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16
define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
; CHECK-LABEL: test_4xi64_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
ret <4 x i64> %res
@@ -1562,10 +1538,10 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1576,10 +1552,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1589,10 +1564,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1603,10 +1578,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1616,10 +1590,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1630,10 +1604,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1643,7 +1616,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
; CHECK-LABEL: test_4xi64_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x i64> %res
@@ -1651,10 +1624,10 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1665,10 +1638,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1678,7 +1650,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; CHECK-LABEL: test_4xi64_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1687,10 +1659,10 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1702,10 +1674,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1717,10 +1688,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1732,10 +1703,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1747,10 +1717,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1762,10 +1732,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1777,7 +1746,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; CHECK-LABEL: test_4xi64_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1786,10 +1755,10 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1801,10 +1770,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index 23d66457994..ff25c005e9c 100644
--- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -46,8 +46,6 @@ define <8 x i1> @test3(<4 x i1> %a) {
; CHECK: # BB#0:
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
-; CHECK-NEXT: kshiftlb $4, %k0, %k0
-; CHECK-NEXT: kshiftrb $4, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
index 584968f1c6e..9aacb23fbd5 100644
--- a/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -413,6 +413,7 @@ define <16 x float> @broadcast_ss_spill(float %x) {
; ALL-NEXT: callq func_f32
; ALL-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload
; ALL-NEXT: addq $24, %rsp
+; ALL-NEXT: .cfi_def_cfa_offset 8
; ALL-NEXT: retq
%a = fadd float %x, %x
call void @func_f32(float %a)
@@ -432,6 +433,7 @@ define <8 x double> @broadcast_sd_spill(double %x) {
; ALL-NEXT: callq func_f64
; ALL-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload
; ALL-NEXT: addq $24, %rsp
+; ALL-NEXT: .cfi_def_cfa_offset 8
; ALL-NEXT: retq
%a = fadd double %x, %x
call void @func_f64(double %a)
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
index d1bf8fd5f3f..7f170cd51bf 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
@@ -717,6 +717,7 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: vpbroadcastb %eax, %zmm3 {%k1}
; X32-NEXT: vmovdqa64 %zmm3, %zmm0
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_set1_epi8:
@@ -1444,6 +1445,7 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: korq %k0, %k1, %k1
; X32-NEXT: vpbroadcastb %eax, %zmm0 {%k1} {z}
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_set1_epi8:
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index a5ef1809157..87565ac129b 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -355,6 +355,7 @@ define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
@@ -380,6 +381,7 @@ define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
@@ -445,6 +447,7 @@ define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
@@ -470,6 +473,7 @@ define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
@@ -1702,6 +1706,7 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
@@ -2503,8 +2508,11 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: addl %esi, %eax
; AVX512F-32-NEXT: adcl %ecx, %edx
; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 12
; AVX512F-32-NEXT: popl %esi
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
; AVX512F-32-NEXT: popl %ebx
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
@@ -2586,6 +2594,7 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
@@ -3387,8 +3396,11 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: addl %esi, %eax
; AVX512F-32-NEXT: adcl %ecx, %edx
; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 12
; AVX512F-32-NEXT: popl %esi
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
; AVX512F-32-NEXT: popl %ebx
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index e23deebd15b..c2620642e5c 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1499,6 +1499,7 @@ define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
ret i64 %res
@@ -1522,6 +1523,7 @@ define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8> %x0)
ret i64 %res
@@ -1712,6 +1714,7 @@ define i64@test_int_x86_avx512_ptestm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $20, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
%res1 = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1)
@@ -1776,6 +1779,7 @@ define i64@test_int_x86_avx512_ptestnm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $20, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
%res1 = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1)
diff --git a/test/CodeGen/X86/avx512bw-vec-test-testn.ll b/test/CodeGen/X86/avx512bw-vec-test-testn.ll
index 6dd6440faa1..82d0b8846de 100644
--- a/test/CodeGen/X86/avx512bw-vec-test-testn.ll
+++ b/test/CodeGen/X86/avx512bw-vec-test-testn.ll
@@ -5,9 +5,7 @@
define zeroext i32 @TEST_mm512_test_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqw %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestmw %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -24,9 +22,7 @@ entry:
define zeroext i64 @TEST_mm512_test_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqb %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestmb %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovq %k0, %rax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -42,10 +38,8 @@ entry:
define zeroext i32 @TEST_mm512_mask_test_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmw %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -63,10 +57,8 @@ entry:
define zeroext i64 @TEST_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovq %rdi, %k1
-; CHECK-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmb %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovq %k0, %rax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -84,9 +76,7 @@ entry:
define zeroext i32 @TEST_mm512_testn_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestnmw %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -103,9 +93,7 @@ entry:
define zeroext i64 @TEST_mm512_testn_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestnmb %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovq %k0, %rax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -121,10 +109,8 @@ entry:
define zeroext i32 @TEST_mm512_mask_testn_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmw %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -142,10 +128,8 @@ entry:
define zeroext i64 @TEST_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovq %rdi, %k1
-; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmb %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovq %k0, %rax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll b/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
index f67ceb2fe04..44075deb1d9 100644
--- a/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
+++ b/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
@@ -5,9 +5,7 @@
define zeroext i16 @TEST_mm_test_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k0
+; CHECK-NEXT: vptestmb %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -23,10 +21,8 @@ entry:
define zeroext i16 @TEST_mm_mask_test_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_mask_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmb %xmm0, %xmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -44,9 +40,7 @@ entry:
define zeroext i8 @TEST_mm_test_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k0
+; CHECK-NEXT: vptestmw %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -62,10 +56,8 @@ entry:
define zeroext i8 @TEST_mm_mask_test_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_mask_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmw %xmm0, %xmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -83,9 +75,7 @@ entry:
define zeroext i16 @TEST_mm_testn_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
+; CHECK-NEXT: vptestnmb %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -101,10 +91,8 @@ entry:
define zeroext i16 @TEST_mm_mask_testn_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_mask_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmb %xmm0, %xmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -122,9 +110,7 @@ entry:
define zeroext i8 @TEST_mm_testn_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
+; CHECK-NEXT: vptestnmw %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -140,10 +126,8 @@ entry:
define zeroext i8 @TEST_mm_mask_testn_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_mask_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmw %xmm0, %xmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -161,9 +145,7 @@ entry:
define i32 @TEST_mm256_test_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0
+; CHECK-NEXT: vptestmb %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -179,10 +161,8 @@ entry:
define i32 @TEST_mm256_mask_test_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_mask_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: vptestmb %ymm0, %ymm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -200,9 +180,7 @@ entry:
define zeroext i16 @TEST_mm256_test_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k0
+; CHECK-NEXT: vptestmw %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -219,10 +197,8 @@ entry:
define zeroext i16 @TEST_mm256_mask_test_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_mask_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: vptestmw %ymm0, %ymm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -241,9 +217,7 @@ entry:
define i32 @TEST_mm256_testn_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
+; CHECK-NEXT: vptestnmb %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -259,10 +233,8 @@ entry:
define i32 @TEST_mm256_mask_testn_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_mask_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmb %ymm0, %ymm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -280,9 +252,7 @@ entry:
define zeroext i16 @TEST_mm256_testn_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
+; CHECK-NEXT: vptestnmw %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -299,10 +269,8 @@ entry:
define zeroext i16 @TEST_mm256_mask_testn_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_mask_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmw %ymm0, %ymm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll
new file mode 100644
index 00000000000..ca5e5523a9d
--- /dev/null
+++ b/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512cd | FileCheck %s
+
+define <8 x i64> @test_mm512_broadcastmb_epi64(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_mm512_broadcastmb_epi64:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = icmp eq <8 x i64> %a, %b
+ %1 = bitcast <8 x i1> %0 to i8
+ %conv.i = zext i8 %1 to i64
+ %vecinit.i.i = insertelement <8 x i64> undef, i64 %conv.i, i32 0
+ %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ ret <8 x i64> %vecinit7.i.i
+}
+
+define <8 x i64> @test_mm512_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_mm512_broadcastmw_epi32:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %a to <16 x i32>
+ %1 = bitcast <8 x i64> %b to <16 x i32>
+ %2 = icmp eq <16 x i32> %0, %1
+ %3 = bitcast <16 x i1> %2 to i16
+ %conv.i = zext i16 %3 to i32
+ %vecinit.i.i = insertelement <16 x i32> undef, i32 %conv.i, i32 0
+ %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %4 = bitcast <16 x i32> %vecinit15.i.i to <8 x i64>
+ ret <8 x i64> %4
+}
+
+
diff --git a/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll
index e5dbff9ac51..92dfe1e087a 100644
--- a/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll
@@ -45,3 +45,26 @@ define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
%res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
ret <8 x i64> %res
}
+
+define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) {
+; CHECK-LABEL: test_x86_vbroadcastmw_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: vpbroadcastd %eax, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0)
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16)
+
+define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) {
+; CHECK-LABEL: test_x86_broadcastmb_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: vpbroadcastq %rax, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0)
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8)
+
diff --git a/test/CodeGen/X86/avx512cd-intrinsics.ll b/test/CodeGen/X86/avx512cd-intrinsics.ll
index 7e5a3e8fe25..ab8c80f8dd3 100644
--- a/test/CodeGen/X86/avx512cd-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cd-intrinsics.ll
@@ -1,28 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s
-define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) {
-; CHECK-LABEL: test_x86_vbroadcastmw_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmw2d %k0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0)
- ret <16 x i32> %res
-}
-declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16)
-
-define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) {
-; CHECK-LABEL: test_x86_broadcastmb_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmb2q %k0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0)
- ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8)
-
declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
define <8 x i64> @test_conflict_q(<8 x i64> %a) {
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
index f8f47c87100..0e310be3489 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
@@ -69,3 +69,47 @@ define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64>
ret <4 x i64> %res2
}
+define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) {
+; CHECK-LABEL: test_x86_vbroadcastmw_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: vpbroadcastd %eax, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ;
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16)
+
+define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) {
+; CHECK-LABEL: test_x86_vbroadcastmw_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: vpbroadcastd %eax, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16)
+
+define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) {
+; CHECK-LABEL: test_x86_broadcastmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: vpbroadcastq %rax, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ;
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8)
+
+define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) {
+; CHECK-LABEL: test_x86_broadcastmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: vpbroadcastq %rax, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8)
+
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
index 96254f7c95b..2fb50297c62 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
@@ -147,46 +147,3 @@ define <4 x i64>@test_int_x86_avx512_mask_vpconflict_q_256(<4 x i64> %x0, <4 x i
ret <4 x i64> %res2
}
-define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) {
-; CHECK-LABEL: test_x86_vbroadcastmw_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmw2d %k0, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ;
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16)
-
-define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) {
-; CHECK-LABEL: test_x86_vbroadcastmw_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmw2d %k0, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16)
-
-define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) {
-; CHECK-LABEL: test_x86_broadcastmb_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmb2q %k0, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ;
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8)
-
-define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) {
-; CHECK-LABEL: test_x86_broadcastmb_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmb2q %k0, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8)
diff --git a/test/CodeGen/X86/avx512f-vec-test-testn.ll b/test/CodeGen/X86/avx512f-vec-test-testn.ll
index c9c0c2251a4..e9cdacc354f 100644
--- a/test/CodeGen/X86/avx512f-vec-test-testn.ll
+++ b/test/CodeGen/X86/avx512f-vec-test-testn.ll
@@ -5,9 +5,7 @@
define zeroext i8 @TEST_mm512_test_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_test_epi64_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -23,9 +21,7 @@ entry:
define zeroext i16 @TEST_mm512_test_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_test_epi32_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestmd %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -42,10 +38,8 @@ entry:
define zeroext i8 @TEST_mm512_mask_test_epi64_mask(i8 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_test_epi64_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -63,10 +57,8 @@ entry:
define zeroext i16 @TEST_mm512_mask_test_epi32_mask(i16 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_test_epi32_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -85,9 +77,7 @@ entry:
define zeroext i8 @TEST_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_testn_epi64_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -103,9 +93,7 @@ entry:
define zeroext i16 @TEST_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_testn_epi32_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -122,10 +110,8 @@ entry:
define zeroext i8 @TEST_mm512_mask_testn_epi64_mask(i8 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_testn_epi64_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -143,10 +129,8 @@ entry:
define zeroext i16 @TEST_mm512_mask_testn_epi32_mask(i16 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_testn_epi32_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
index f5578d6cc88..3f4a696af0c 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -233,6 +233,7 @@ define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64>
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastd_epi32:
@@ -265,6 +266,7 @@ define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastd_epi32:
@@ -369,6 +371,7 @@ define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64>
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastq_epi64:
@@ -398,6 +401,7 @@ define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastq_epi64:
@@ -441,6 +445,7 @@ define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastq_epi64:
@@ -470,6 +475,7 @@ define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
@@ -513,6 +519,7 @@ define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastsd_pd:
@@ -542,6 +549,7 @@ define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastsd_pd:
@@ -585,6 +593,7 @@ define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastsd_pd:
@@ -614,6 +623,7 @@ define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
@@ -657,6 +667,7 @@ define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x fl
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastss_ps:
@@ -686,6 +697,7 @@ define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastss_ps:
@@ -781,6 +793,7 @@ define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x doub
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_movddup_pd:
@@ -810,6 +823,7 @@ define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movddup_pd:
@@ -853,6 +867,7 @@ define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x d
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_movddup_pd:
@@ -882,6 +897,7 @@ define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_movddup_pd:
@@ -925,6 +941,7 @@ define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_movehdup_ps:
@@ -954,6 +971,7 @@ define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movehdup_ps:
@@ -1049,6 +1067,7 @@ define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_moveldup_ps:
@@ -1078,6 +1097,7 @@ define <4 x float> @test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_moveldup_ps:
@@ -1173,6 +1193,7 @@ define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_epi64:
@@ -1202,6 +1223,7 @@ define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_epi64:
@@ -1245,6 +1267,7 @@ define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_pd:
@@ -1274,6 +1297,7 @@ define <4 x double> @test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_pd:
@@ -1317,6 +1341,7 @@ define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x doub
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_pd:
@@ -1346,6 +1371,7 @@ define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x dou
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_pd:
@@ -1389,6 +1415,7 @@ define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x d
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_pd:
@@ -1418,6 +1445,7 @@ define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_pd:
@@ -1461,6 +1489,7 @@ define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float>
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_ps:
@@ -1490,6 +1519,7 @@ define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_ps:
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index b6723ee50b0..6c6fad794c8 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -4712,8 +4712,8 @@ declare <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32>, <8 x i32>, i32,
define <8 x i32>@test_int_x86_avx512_mask_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x03,0xd9,0x06]
-; CHECK-NEXT: ## ymm3 = ymm1[6,7],ymm0[0,1,2,3,4,5]
+; CHECK-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03]
+; CHECK-NEXT: ## ymm3 = ymm1[3],ymm0[0,1,2]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x03,0xd1,0x06]
; CHECK-NEXT: ## ymm2 {%k1} = ymm1[6,7],ymm0[0,1,2,3,4,5]
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 9098ca30897..35fecf8955c 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -2729,8 +2729,8 @@ define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x
; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xd1,0x16]
; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc1,0x16]
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vperm2f128 $48, %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x30]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0]
; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2752,7 +2752,7 @@ define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4
; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xd1,0x16]
; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc1,0x16]
+; CHECK-NEXT: vperm2f128 $48, %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x30]
; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0]
; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
@@ -2773,8 +2773,8 @@ define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x43,0xd1,0x16]
; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x43,0xc1,0x16]
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vperm2i128 $48, %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x46,0xc1,0x30]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
@@ -2791,7 +2791,7 @@ define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xd1,0x16]
; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x43,0xc1,0x16]
+; CHECK-NEXT: vperm2i128 $48, %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x46,0xc1,0x30]
; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
diff --git a/test/CodeGen/X86/avx512vl-vbroadcast.ll b/test/CodeGen/X86/avx512vl-vbroadcast.ll
index 9fc957297e2..1098e7bffe0 100644
--- a/test/CodeGen/X86/avx512vl-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512vl-vbroadcast.ll
@@ -12,6 +12,7 @@ define <8 x float> @_256_broadcast_ss_spill(float %x) {
; CHECK-NEXT: callq func_f32
; CHECK-NEXT: vbroadcastss (%rsp), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%a = fadd float %x, %x
call void @func_f32(float %a)
@@ -30,6 +31,7 @@ define <4 x float> @_128_broadcast_ss_spill(float %x) {
; CHECK-NEXT: callq func_f32
; CHECK-NEXT: vbroadcastss (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%a = fadd float %x, %x
call void @func_f32(float %a)
@@ -49,6 +51,7 @@ define <4 x double> @_256_broadcast_sd_spill(double %x) {
; CHECK-NEXT: callq func_f64
; CHECK-NEXT: vbroadcastsd (%rsp), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%a = fadd double %x, %x
call void @func_f64(double %a)
diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index 5ee06fde127..bccf953fb0b 100644
--- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -109,6 +109,7 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -227,6 +228,7 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -348,6 +350,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -470,6 +473,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -597,6 +601,7 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -720,6 +725,7 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -846,6 +852,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -973,6 +980,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1024,6 +1032,7 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1071,6 +1080,7 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1129,6 +1139,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1188,6 +1199,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1217,8 +1229,6 @@ define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -1246,8 +1256,6 @@ define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -1278,8 +1286,6 @@ define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -1311,8 +1317,6 @@ define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -1392,6 +1396,7 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1465,6 +1470,7 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1541,6 +1547,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1618,6 +1625,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1700,6 +1708,7 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1778,6 +1787,7 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1859,6 +1869,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1941,6 +1952,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2064,6 +2076,7 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2183,6 +2196,7 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2305,6 +2319,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2428,6 +2443,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2556,6 +2572,7 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2680,6 +2697,7 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2807,6 +2825,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2935,6 +2954,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -3288,6 +3308,7 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -3552,6 +3573,7 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -3912,6 +3934,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -4188,6 +4211,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5051,6 +5075,7 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5092,6 +5117,7 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5153,6 +5179,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5216,6 +5243,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5263,6 +5291,7 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5326,6 +5355,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5379,6 +5409,7 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5426,6 +5457,7 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5493,6 +5525,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5562,6 +5595,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5615,6 +5649,7 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5684,6 +5719,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5957,6 +5993,7 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6030,6 +6067,7 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6106,6 +6144,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6183,6 +6222,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6260,6 +6300,7 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6337,6 +6378,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6420,6 +6462,7 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6498,6 +6541,7 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6579,6 +6623,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6661,6 +6706,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6743,6 +6789,7 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6825,6 +6872,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6946,6 +6994,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7062,6 +7111,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7181,6 +7231,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7301,6 +7352,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7421,6 +7473,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7541,6 +7594,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7667,6 +7721,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7788,6 +7843,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7912,6 +7968,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -8037,6 +8094,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -8162,6 +8220,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -8287,6 +8346,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9131,6 +9191,7 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9172,6 +9233,7 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9225,6 +9287,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9280,6 +9343,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9327,6 +9391,7 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9382,6 +9447,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9435,6 +9501,7 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9482,6 +9549,7 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9541,6 +9609,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9602,6 +9671,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9655,6 +9725,7 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9716,6 +9787,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10607,6 +10679,7 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10650,6 +10723,7 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10713,6 +10787,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10778,6 +10853,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10827,6 +10903,7 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10892,6 +10969,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10947,6 +11025,7 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10996,6 +11075,7 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11065,6 +11145,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11136,6 +11217,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11191,6 +11273,7 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11262,6 +11345,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11509,6 +11593,7 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11580,6 +11665,7 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11654,6 +11740,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11729,6 +11816,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11804,6 +11892,7 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11879,6 +11968,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11960,6 +12050,7 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12036,6 +12127,7 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12115,6 +12207,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12195,6 +12288,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12275,6 +12369,7 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12355,6 +12450,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12478,6 +12574,7 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12596,6 +12693,7 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12717,6 +12815,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12839,6 +12938,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12966,6 +13066,7 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13089,6 +13190,7 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13215,6 +13317,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13342,6 +13445,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13393,6 +13497,7 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13440,6 +13545,7 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13498,6 +13604,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13557,6 +13664,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13586,8 +13694,6 @@ define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -13615,8 +13721,6 @@ define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -13647,8 +13751,6 @@ define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -13680,8 +13782,6 @@ define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -13761,6 +13861,7 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13834,6 +13935,7 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13910,6 +14012,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13987,6 +14090,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14069,6 +14173,7 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14147,6 +14252,7 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14228,6 +14334,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14310,6 +14417,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14433,6 +14541,7 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14552,6 +14661,7 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14674,6 +14784,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14797,6 +14908,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14925,6 +15037,7 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -15049,6 +15162,7 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -15176,6 +15290,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -15304,6 +15419,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -15657,6 +15773,7 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -15921,6 +16038,7 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -16281,6 +16399,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -16557,6 +16676,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17420,6 +17540,7 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17461,6 +17582,7 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17522,6 +17644,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17585,6 +17708,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17632,6 +17756,7 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17695,6 +17820,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17748,6 +17874,7 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17795,6 +17922,7 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17862,6 +17990,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17931,6 +18060,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17984,6 +18114,7 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18053,6 +18184,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18326,6 +18458,7 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18399,6 +18532,7 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18475,6 +18609,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18552,6 +18687,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18629,6 +18765,7 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18706,6 +18843,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18789,6 +18927,7 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18867,6 +19006,7 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18948,6 +19088,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19030,6 +19171,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19112,6 +19254,7 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19194,6 +19337,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19315,6 +19459,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19431,6 +19576,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19550,6 +19696,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19670,6 +19817,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19790,6 +19938,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19910,6 +20059,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20036,6 +20186,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20157,6 +20308,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20281,6 +20433,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20406,6 +20559,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20531,6 +20685,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20656,6 +20811,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21500,6 +21656,7 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21541,6 +21698,7 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21594,6 +21752,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21649,6 +21808,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21696,6 +21856,7 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21751,6 +21912,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21804,6 +21966,7 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21851,6 +22014,7 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21910,6 +22074,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21971,6 +22136,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -22024,6 +22190,7 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -22085,6 +22252,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -22976,6 +23144,7 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23019,6 +23188,7 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23082,6 +23252,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23147,6 +23318,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23196,6 +23368,7 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23261,6 +23434,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23316,6 +23490,7 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23365,6 +23540,7 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23434,6 +23610,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23505,6 +23682,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23560,6 +23738,7 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23631,6 +23810,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23878,6 +24058,7 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23949,6 +24130,7 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24023,6 +24205,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24098,6 +24281,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24173,6 +24357,7 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24248,6 +24433,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24329,6 +24515,7 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24405,6 +24592,7 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24484,6 +24672,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24564,6 +24753,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24644,6 +24834,7 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24724,6 +24915,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24849,6 +25041,7 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24970,6 +25163,7 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25093,6 +25287,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25218,6 +25413,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25347,6 +25543,7 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25473,6 +25670,7 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25601,6 +25799,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25731,6 +25930,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25784,6 +25984,7 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25834,6 +26035,7 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25894,6 +26096,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25956,6 +26159,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25987,8 +26191,6 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -26019,8 +26221,6 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -26053,8 +26253,6 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -26089,8 +26287,6 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -26172,6 +26368,7 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26248,6 +26445,7 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26326,6 +26524,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26406,6 +26605,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26490,6 +26690,7 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26571,6 +26772,7 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26654,6 +26856,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26739,6 +26942,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26864,6 +27068,7 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26986,6 +27191,7 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27110,6 +27316,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27236,6 +27443,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27366,6 +27574,7 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27493,6 +27702,7 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27622,6 +27832,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27753,6 +27964,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -28109,6 +28321,7 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -28378,6 +28591,7 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -28741,6 +28955,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -29022,6 +29237,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -29903,6 +30119,7 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -29947,6 +30164,7 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30008,6 +30226,7 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30072,6 +30291,7 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30121,6 +30341,7 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30184,6 +30405,7 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30239,6 +30461,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30289,6 +30512,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30356,6 +30580,7 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30426,6 +30651,7 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30481,6 +30707,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30550,6 +30777,7 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30823,6 +31051,7 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30896,6 +31125,7 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30972,6 +31202,7 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31049,6 +31280,7 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31126,6 +31358,7 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31203,6 +31436,7 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31286,6 +31520,7 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31364,6 +31599,7 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31445,6 +31681,7 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31527,6 +31764,7 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31609,6 +31847,7 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31691,6 +31930,7 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31812,6 +32052,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31928,6 +32169,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32047,6 +32289,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32167,6 +32410,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32287,6 +32531,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32407,6 +32652,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32533,6 +32779,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32654,6 +32901,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32778,6 +33026,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32903,6 +33152,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -33028,6 +33278,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -33153,6 +33404,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34023,6 +34275,7 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34067,6 +34320,7 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34120,6 +34374,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34176,6 +34431,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34225,6 +34481,7 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34280,6 +34537,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34335,6 +34593,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34385,6 +34644,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34444,6 +34704,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34506,6 +34767,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34561,6 +34823,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34622,6 +34885,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35543,6 +35807,7 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35589,6 +35854,7 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35654,6 +35920,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35722,6 +35989,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35773,6 +36041,7 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35840,6 +36109,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35897,6 +36167,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35949,6 +36220,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36020,6 +36292,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36094,6 +36367,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36151,6 +36425,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36224,6 +36499,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36471,6 +36747,7 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36542,6 +36819,7 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36616,6 +36894,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36691,6 +36970,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36766,6 +37046,7 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36841,6 +37122,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36922,6 +37204,7 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36998,6 +37281,7 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37077,6 +37361,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37157,6 +37442,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37237,6 +37523,7 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37317,6 +37604,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37443,6 +37731,7 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37564,6 +37853,7 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37688,6 +37978,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37813,6 +38104,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37943,6 +38235,7 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38069,6 +38362,7 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38198,6 +38492,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38328,6 +38623,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38382,6 +38678,7 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38432,6 +38729,7 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38493,6 +38791,7 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38555,6 +38854,7 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38587,8 +38887,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -38619,8 +38917,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -38654,8 +38950,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -38690,8 +38984,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -38774,6 +39066,7 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38850,6 +39143,7 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38929,6 +39223,7 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39009,6 +39304,7 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39094,6 +39390,7 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39175,6 +39472,7 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39259,6 +39557,7 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39344,6 +39643,7 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39470,6 +39770,7 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39592,6 +39893,7 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39717,6 +40019,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39843,6 +40146,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39974,6 +40278,7 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -40101,6 +40406,7 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -40231,6 +40537,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -40362,6 +40669,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -40720,6 +41028,7 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -40989,6 +41298,7 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -41354,6 +41664,7 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -41635,6 +41946,7 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42537,6 +42849,7 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42581,6 +42894,7 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42645,6 +42959,7 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42711,6 +43026,7 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42761,6 +43077,7 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42827,6 +43144,7 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42883,6 +43201,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42933,6 +43252,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43003,6 +43323,7 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43075,6 +43396,7 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43131,6 +43453,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43203,6 +43526,7 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43476,6 +43800,7 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43549,6 +43874,7 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43625,6 +43951,7 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43702,6 +44029,7 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43779,6 +44107,7 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43856,6 +44185,7 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43939,6 +44269,7 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44017,6 +44348,7 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44098,6 +44430,7 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44180,6 +44513,7 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44262,6 +44596,7 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44344,6 +44679,7 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44465,6 +44801,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44581,6 +44918,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44700,6 +45038,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44820,6 +45159,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44940,6 +45280,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45060,6 +45401,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45186,6 +45528,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45307,6 +45650,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45431,6 +45775,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45556,6 +45901,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45681,6 +46027,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45806,6 +46153,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46707,6 +47055,7 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46751,6 +47100,7 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46807,6 +47157,7 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46865,6 +47216,7 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46915,6 +47267,7 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46973,6 +47326,7 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47029,6 +47383,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47079,6 +47434,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47141,6 +47497,7 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47205,6 +47562,7 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47261,6 +47619,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47325,6 +47684,7 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48255,6 +48615,7 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48301,6 +48662,7 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48367,6 +48729,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48435,6 +48798,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48487,6 +48851,7 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48555,6 +48920,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48613,6 +48979,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48665,6 +49032,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48737,6 +49105,7 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48811,6 +49180,7 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48869,6 +49239,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48943,6 +49314,7 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49190,6 +49562,7 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49261,6 +49634,7 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49335,6 +49709,7 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49410,6 +49785,7 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49485,6 +49861,7 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49560,6 +49937,7 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49641,6 +50019,7 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49717,6 +50096,7 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49796,6 +50176,7 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49876,6 +50257,7 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49956,6 +50338,7 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -50036,6 +50419,7 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -50829,6 +51213,7 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -50870,6 +51255,7 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -50913,6 +51299,7 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, float*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -50964,6 +51351,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask(i4 zeroext %__u, <2 x
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51015,6 +51403,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51068,6 +51457,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51121,6 +51511,7 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51168,6 +51559,7 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51217,6 +51609,7 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51274,6 +51667,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51331,6 +51725,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51390,6 +51785,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51663,6 +52059,7 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51736,6 +52133,7 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51810,6 +52208,7 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51887,6 +52286,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51964,6 +52364,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52042,6 +52443,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52126,6 +52528,7 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52204,6 +52607,7 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52283,6 +52687,7 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52365,6 +52770,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52447,6 +52853,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52530,6 +52937,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52652,6 +53060,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52768,6 +53177,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52885,6 +53295,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53005,6 +53416,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53125,6 +53537,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53246,6 +53659,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53414,6 +53828,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53535,6 +53950,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53657,6 +54073,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53782,6 +54199,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53907,6 +54325,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -54033,6 +54452,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -54886,6 +55306,7 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -54927,6 +55348,7 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -54970,6 +55392,7 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55020,6 +55443,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask(i2 zeroext %__u, <2 x
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55070,6 +55494,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem(i2 zeroext %__u, <
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55122,6 +55547,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55175,6 +55601,7 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55222,6 +55649,7 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55271,6 +55699,7 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55327,6 +55756,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55383,6 +55813,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55441,6 +55872,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56260,6 +56692,7 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56303,6 +56736,7 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56348,6 +56782,7 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56401,6 +56836,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56454,6 +56890,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56509,6 +56946,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56564,6 +57002,7 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56613,6 +57052,7 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56664,6 +57104,7 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56723,6 +57164,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56782,6 +57224,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56843,6 +57286,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57146,6 +57590,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57217,6 +57662,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57289,6 +57735,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57364,6 +57811,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57439,6 +57887,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57515,6 +57964,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57647,6 +58097,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57723,6 +58174,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57800,6 +58252,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57880,6 +58333,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57960,6 +58414,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -58041,6 +58496,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/avx512vl-vec-test-testn.ll b/test/CodeGen/X86/avx512vl-vec-test-testn.ll
index f1919cb118c..32de0254efa 100644
--- a/test/CodeGen/X86/avx512vl-vec-test-testn.ll
+++ b/test/CodeGen/X86/avx512vl-vec-test-testn.ll
@@ -6,18 +6,14 @@
define zeroext i8 @TEST_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_test_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpneqq %xmm1, %xmm0, %k0
+; X86_64-NEXT: vptestmq %xmm0, %xmm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_test_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpneqq %xmm1, %xmm0, %k0
+; I386-NEXT: vptestmq %xmm0, %xmm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -33,18 +29,14 @@ entry:
define zeroext i8 @TEST_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_test_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpneqd %xmm1, %xmm0, %k0
+; X86_64-NEXT: vptestmd %xmm0, %xmm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_test_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpneqd %xmm1, %xmm0, %k0
+; I386-NEXT: vptestmd %xmm0, %xmm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -61,9 +53,7 @@ entry:
define zeroext i8 @TEST_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_test_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpneqq %ymm1, %ymm0, %k0
+; X86_64-NEXT: vptestmq %ymm0, %ymm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -71,9 +61,7 @@ define zeroext i8 @TEST_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) lo
;
; I386-LABEL: TEST_mm256_test_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpneqq %ymm1, %ymm0, %k0
+; I386-NEXT: vptestmq %ymm0, %ymm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -90,9 +78,7 @@ entry:
define zeroext i8 @TEST_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_test_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpneqd %ymm1, %ymm0, %k0
+; X86_64-NEXT: vptestmd %ymm0, %ymm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -100,9 +86,7 @@ define zeroext i8 @TEST_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) lo
;
; I386-LABEL: TEST_mm256_test_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpneqd %ymm1, %ymm0, %k0
+; I386-NEXT: vptestmd %ymm0, %ymm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -119,21 +103,17 @@ entry:
define zeroext i8 @TEST_mm_mask_test_epi64_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_mask_test_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpneqq %xmm1, %xmm0, %k0 {%k1}
+; X86_64-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_mask_test_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpneqq %xmm1, %xmm0, %k0 {%k1}
+; I386-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -152,21 +132,17 @@ entry:
define zeroext i8 @TEST_mm_mask_test_epi32_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_mask_test_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpneqd %xmm1, %xmm0, %k0 {%k1}
+; X86_64-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_mask_test_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpneqd %xmm1, %xmm0, %k0 {%k1}
+; I386-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -187,10 +163,8 @@ entry:
define zeroext i8 @TEST_mm256_mask_test_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_mask_test_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpneqq %ymm1, %ymm0, %k0 {%k1}
+; X86_64-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -198,11 +172,9 @@ define zeroext i8 @TEST_mm256_mask_test_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x
;
; I386-LABEL: TEST_mm256_mask_test_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpneqq %ymm1, %ymm0, %k0 {%k1}
+; I386-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -222,10 +194,8 @@ entry:
define zeroext i8 @TEST_mm256_mask_test_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_mask_test_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpneqd %ymm1, %ymm0, %k0 {%k1}
+; X86_64-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -233,11 +203,9 @@ define zeroext i8 @TEST_mm256_mask_test_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x
;
; I386-LABEL: TEST_mm256_mask_test_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpneqd %ymm1, %ymm0, %k0 {%k1}
+; I386-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -256,18 +224,14 @@ entry:
define zeroext i8 @TEST_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_testn_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; X86_64-NEXT: vptestnmq %xmm0, %xmm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_testn_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; I386-NEXT: vptestnmq %xmm0, %xmm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -283,18 +247,14 @@ entry:
define zeroext i8 @TEST_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_testn_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; X86_64-NEXT: vptestnmd %xmm0, %xmm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_testn_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; I386-NEXT: vptestnmd %xmm0, %xmm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -311,9 +271,7 @@ entry:
define zeroext i8 @TEST_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_testn_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; X86_64-NEXT: vptestnmq %ymm0, %ymm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -321,9 +279,7 @@ define zeroext i8 @TEST_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) l
;
; I386-LABEL: TEST_mm256_testn_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; I386-NEXT: vptestnmq %ymm0, %ymm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -340,9 +296,7 @@ entry:
define zeroext i8 @TEST_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_testn_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; X86_64-NEXT: vptestnmd %ymm0, %ymm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -350,9 +304,7 @@ define zeroext i8 @TEST_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) l
;
; I386-LABEL: TEST_mm256_testn_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; I386-NEXT: vptestnmd %ymm0, %ymm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -369,21 +321,17 @@ entry:
define zeroext i8 @TEST_mm_mask_testn_epi64_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_mask_testn_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; X86_64-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_mask_testn_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; I386-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -402,21 +350,17 @@ entry:
define zeroext i8 @TEST_mm_mask_testn_epi32_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_mask_testn_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; X86_64-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_mask_testn_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; I386-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -437,10 +381,8 @@ entry:
define zeroext i8 @TEST_mm256_mask_testn_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_mask_testn_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; X86_64-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -448,11 +390,9 @@ define zeroext i8 @TEST_mm256_mask_testn_epi64_mask(i8 %__U, <4 x i64> %__A, <4
;
; I386-LABEL: TEST_mm256_mask_testn_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; I386-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -472,10 +412,8 @@ entry:
define zeroext i8 @TEST_mm256_mask_testn_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_mask_testn_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; X86_64-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -483,11 +421,9 @@ define zeroext i8 @TEST_mm256_mask_testn_epi32_mask(i8 %__U, <4 x i64> %__A, <4
;
; I386-LABEL: TEST_mm256_mask_testn_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; I386-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll
new file mode 100644
index 00000000000..ab4cbeb8d5e
--- /dev/null
+++ b/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s
+
+define <2 x i64> @test_mm_broadcastmb_epi64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_mm_broadcastmb_epi64:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %a to <4 x i32>
+ %1 = bitcast <2 x i64> %b to <4 x i32>
+ %2 = icmp eq <4 x i32> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <8 x i1> %3 to i8
+ %conv.i = zext i8 %4 to i64
+ %vecinit.i.i = insertelement <2 x i64> undef, i64 %conv.i, i32 0
+ %vecinit1.i.i = shufflevector <2 x i64> %vecinit.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %vecinit1.i.i
+}
+
+define <4 x i64> @test_mm256_broadcastmb_epi64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_mm256_broadcastmb_epi64:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %ymm0
+; CHECK-NEXT: retq
+entry:
+ %0 = icmp eq <4 x i64> %a, %b
+ %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = bitcast <8 x i1> %1 to i8
+ %conv.i = zext i8 %2 to i64
+ %vecinit.i.i = insertelement <4 x i64> undef, i64 %conv.i, i32 0
+ %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
+ ret <4 x i64> %vecinit3.i.i
+}
+
+define <2 x i64> @test_mm_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_mm_broadcastmw_epi32:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %a to <16 x i32>
+ %1 = bitcast <8 x i64> %b to <16 x i32>
+ %2 = icmp eq <16 x i32> %0, %1
+ %3 = bitcast <16 x i1> %2 to i16
+ %conv.i = zext i16 %3 to i32
+ %vecinit.i.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0
+ %vecinit3.i.i = shufflevector <4 x i32> %vecinit.i.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %4 = bitcast <4 x i32> %vecinit3.i.i to <2 x i64>
+ ret <2 x i64> %4
+}
+
+define <4 x i64> @test_mm256_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_mm256_broadcastmw_epi32:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %ymm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %a to <16 x i32>
+ %1 = bitcast <8 x i64> %b to <16 x i32>
+ %2 = icmp eq <16 x i32> %0, %1
+ %3 = bitcast <16 x i1> %2 to i16
+ %conv.i = zext i16 %3 to i32
+ %vecinit.i.i = insertelement <8 x i32> undef, i32 %conv.i, i32 0
+ %vecinit7.i.i = shufflevector <8 x i32> %vecinit.i.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %4 = bitcast <8 x i32> %vecinit7.i.i to <4 x i64>
+ ret <4 x i64> %4
+}
+
+
diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll
index e197713c679..c48222000c6 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -439,6 +439,7 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
; AVX512F-NEXT: movl (%rsp), %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
diff --git a/test/CodeGen/X86/bitcast-and-setcc-512.ll b/test/CodeGen/X86/bitcast-and-setcc-512.ll
index f6cfbbb4044..f5fe395eaf3 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-512.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-512.ll
@@ -594,6 +594,7 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-NEXT: movl (%rsp), %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -1239,6 +1240,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: .cfi_def_cfa %rsp, 8
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1457,6 +1459,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: .cfi_def_cfa %rsp, 8
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1499,6 +1502,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; AVX512F-NEXT: orq %rcx, %rax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index 4ed55ac0919..1959000b859 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -321,11 +321,17 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) {
; AVX512-NEXT: vpinsrb $15, %r9d, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: popq %r12
+; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: popq %r13
+; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: popq %r14
+; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: popq %r15
+; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
%1 = bitcast i16 %a0 to <16 x i1>
%2 = zext <16 x i1> %1 to <16 x i8>
diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll
index ee2dac1d466..76160517546 100644
--- a/test/CodeGen/X86/bitcast-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-setcc-256.ll
@@ -204,6 +204,7 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX512F-NEXT: movl (%rsp), %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
diff --git a/test/CodeGen/X86/bitcast-setcc-512.ll b/test/CodeGen/X86/bitcast-setcc-512.ll
index 2b73c6e16bd..ef981080bb3 100644
--- a/test/CodeGen/X86/bitcast-setcc-512.ll
+++ b/test/CodeGen/X86/bitcast-setcc-512.ll
@@ -203,6 +203,7 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX512F-NEXT: movl (%rsp), %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -769,6 +770,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: .cfi_def_cfa %rsp, 8
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -983,6 +985,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: .cfi_def_cfa %rsp, 8
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1021,6 +1024,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-NEXT: orq %rcx, %rax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
diff --git a/test/CodeGen/X86/bool-vector.ll b/test/CodeGen/X86/bool-vector.ll
index eb40744c54d..692d992df76 100644
--- a/test/CodeGen/X86/bool-vector.ll
+++ b/test/CodeGen/X86/bool-vector.ll
@@ -93,6 +93,7 @@ define i32 @PR15215_good(<4 x i32> %input) {
; X32-NEXT: leal (%eax,%edx,4), %eax
; X32-NEXT: leal (%eax,%esi,8), %eax
; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X32-SSE2-LABEL: PR15215_good:
@@ -115,6 +116,7 @@ define i32 @PR15215_good(<4 x i32> %input) {
; X32-SSE2-NEXT: leal (%eax,%edx,4), %eax
; X32-SSE2-NEXT: leal (%eax,%esi,8), %eax
; X32-SSE2-NEXT: popl %esi
+; X32-SSE2-NEXT: .cfi_def_cfa_offset 4
; X32-SSE2-NEXT: retl
;
; X32-AVX2-LABEL: PR15215_good:
@@ -134,6 +136,7 @@ define i32 @PR15215_good(<4 x i32> %input) {
; X32-AVX2-NEXT: leal (%eax,%edx,4), %eax
; X32-AVX2-NEXT: leal (%eax,%esi,8), %eax
; X32-AVX2-NEXT: popl %esi
+; X32-AVX2-NEXT: .cfi_def_cfa_offset 4
; X32-AVX2-NEXT: retl
;
; X64-LABEL: PR15215_good:
diff --git a/test/CodeGen/X86/broadcastm-lowering.ll b/test/CodeGen/X86/broadcastm-lowering.ll
index 2a8236cf093..fc7b192c2f8 100644
--- a/test/CodeGen/X86/broadcastm-lowering.ll
+++ b/test/CodeGen/X86/broadcastm-lowering.ll
@@ -80,8 +80,7 @@ define <16 x i32> @test_mm512_epi32(<16 x i32> %a, <16 x i32> %b) {
; AVX512CD-LABEL: test_mm512_epi32:
; AVX512CD: # BB#0: # %entry
; AVX512CD-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; AVX512CD-NEXT: kmovw %k0, %eax
-; AVX512CD-NEXT: vpbroadcastd %eax, %zmm0
+; AVX512CD-NEXT: vpbroadcastmw2d %k0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512VLCDBW-LABEL: test_mm512_epi32:
@@ -110,9 +109,7 @@ define <8 x i64> @test_mm512_epi64(<8 x i32> %a, <8 x i32> %b) {
; AVX512CD-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; AVX512CD-NEXT: kmovw %k0, %eax
-; AVX512CD-NEXT: movzbl %al, %eax
-; AVX512CD-NEXT: vpbroadcastq %rax, %zmm0
+; AVX512CD-NEXT: vpbroadcastmb2q %k0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512VLCDBW-LABEL: test_mm512_epi64:
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index 82e133d2576..6f9abae6a71 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -247,10 +247,13 @@ define i32 @test12() ssp uwtable {
; CHECK-NEXT: # BB#1: # %T
; CHECK-NEXT: movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
; CHECK-NEXT: popq %rcx # encoding: [0x59]
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq # encoding: [0xc3]
; CHECK-NEXT: .LBB12_2: # %F
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movl $2, %eax # encoding: [0xb8,0x02,0x00,0x00,0x00]
; CHECK-NEXT: popq %rcx # encoding: [0x59]
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%tmp1 = call zeroext i1 @test12b()
diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll
index 9f7f8a97dc2..c5f03dbd5a3 100644
--- a/test/CodeGen/X86/combine-srl.ll
+++ b/test/CodeGen/X86/combine-srl.ll
@@ -175,7 +175,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
; SSE: # BB#0:
; SSE-NEXT: psrlq $48, %xmm1
; SSE-NEXT: psrlq $48, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: packusdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr0:
diff --git a/test/CodeGen/X86/compress_expand.ll b/test/CodeGen/X86/compress_expand.ll
index c6a1c07922e..9237544ea95 100644
--- a/test/CodeGen/X86/compress_expand.ll
+++ b/test/CodeGen/X86/compress_expand.ll
@@ -140,9 +140,7 @@ define void @test7(float* %base, <8 x float> %V, <8 x i1> %mask) {
; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL-NEXT: vptestmq %zmm1, %zmm1, %k0
-; KNL-NEXT: kshiftlw $8, %k0, %k0
-; KNL-NEXT: kshiftrw $8, %k0, %k1
+; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
; KNL-NEXT: retq
call void @llvm.masked.compressstore.v8f32(<8 x float> %V, float* %base, <8 x i1> %mask)
diff --git a/test/CodeGen/X86/emutls-pie.ll b/test/CodeGen/X86/emutls-pie.ll
index 3c312a92669..f4561fcbd35 100644
--- a/test/CodeGen/X86/emutls-pie.ll
+++ b/test/CodeGen/X86/emutls-pie.ll
@@ -18,13 +18,16 @@ define i32 @my_get_xyz() {
; X32-NEXT: calll my_emutls_get_address@PLT
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
; X64-LABEL: my_get_xyz:
; X64: movq my_emutls_v_xyz@GOTPCREL(%rip), %rdi
; X64-NEXT: callq my_emutls_get_address@PLT
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
@@ -44,13 +47,16 @@ define i32 @f1() {
; X32-NEXT: calll __emutls_get_address@PLT
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
; X64-LABEL: f1:
; X64: leaq __emutls_v.i(%rip), %rdi
; X64-NEXT: callq __emutls_get_address@PLT
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/emutls.ll b/test/CodeGen/X86/emutls.ll
index 8c0ba903659..2321cd2fc28 100644
--- a/test/CodeGen/X86/emutls.ll
+++ b/test/CodeGen/X86/emutls.ll
@@ -16,12 +16,14 @@ define i32 @my_get_xyz() {
; X32-NEXT: calll my_emutls_get_address
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
; X64-LABEL: my_get_xyz:
; X64: movl $my_emutls_v_xyz, %edi
; X64-NEXT: callq my_emutls_get_address
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
@@ -45,12 +47,14 @@ define i32 @f1() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
; X64-LABEL: f1:
; X64: movl $__emutls_v.i1, %edi
; X64-NEXT: callq __emutls_get_address
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
@@ -63,11 +67,13 @@ define i32* @f2() {
; X32: movl $__emutls_v.i1, (%esp)
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
; X64-LABEL: f2:
; X64: movl $__emutls_v.i1, %edi
; X64-NEXT: callq __emutls_get_address
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
@@ -92,6 +98,7 @@ define i32* @f4() {
; X32: movl $__emutls_v.i2, (%esp)
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -116,6 +123,7 @@ define i32* @f6() {
; X32: movl $__emutls_v.i3, (%esp)
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -128,6 +136,7 @@ define i32 @f7() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -140,6 +149,7 @@ define i32* @f8() {
; X32: movl $__emutls_v.i4, (%esp)
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -152,6 +162,7 @@ define i32 @f9() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -164,6 +175,7 @@ define i32* @f10() {
; X32: movl $__emutls_v.i5, (%esp)
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -176,6 +188,7 @@ define i16 @f11() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movzwl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -189,6 +202,7 @@ define i32 @f12() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movswl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -203,6 +217,7 @@ define i8 @f13() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movb (%eax), %al
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -216,6 +231,7 @@ define i32 @f14() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movsbl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
diff --git a/test/CodeGen/X86/epilogue-cfi-fp.ll b/test/CodeGen/X86/epilogue-cfi-fp.ll
new file mode 100644
index 00000000000..c2fe1c7eaac
--- /dev/null
+++ b/test/CodeGen/X86/epilogue-cfi-fp.ll
@@ -0,0 +1,43 @@
+; RUN: llc -O0 %s -o - | FileCheck %s
+
+; ModuleID = 'epilogue-cfi-fp.c'
+source_filename = "epilogue-cfi-fp.c"
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i686-pc-linux"
+
+; Function Attrs: noinline nounwind
+define i32 @foo(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m) #0 {
+
+; CHECK-LABEL: foo:
+; CHECK: popl %ebp
+; CHECK-NEXT: .cfi_def_cfa %esp, 4
+; CHECK-NEXT: retl
+
+entry:
+ %i.addr = alloca i32, align 4
+ %j.addr = alloca i32, align 4
+ %k.addr = alloca i32, align 4
+ %l.addr = alloca i32, align 4
+ %m.addr = alloca i32, align 4
+ store i32 %i, i32* %i.addr, align 4
+ store i32 %j, i32* %j.addr, align 4
+ store i32 %k, i32* %k.addr, align 4
+ store i32 %l, i32* %l.addr, align 4
+ store i32 %m, i32* %m.addr, align 4
+ ret i32 0
+}
+
+attributes #0 = { "no-frame-pointer-elim"="true" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (http://llvm.org/git/clang.git 3f8116e6a2815b1d5f3491493938d0c63c9f42c9) (http://llvm.org/git/llvm.git 4fde77f8f1a8e4482e69b6a7484bc7d1b99b3c0a)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "epilogue-cfi-fp.c", directory: "epilogue-dwarf/test")
+!2 = !{}
+!3 = !{i32 1, !"NumRegisterParameters", i32 0}
+!4 = !{i32 2, !"Dwarf Version", i32 4}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{i32 1, !"wchar_size", i32 4}
+!7 = !{i32 7, !"PIC Level", i32 2}
+
diff --git a/test/CodeGen/X86/epilogue-cfi-no-fp.ll b/test/CodeGen/X86/epilogue-cfi-no-fp.ll
new file mode 100644
index 00000000000..79d6f478de8
--- /dev/null
+++ b/test/CodeGen/X86/epilogue-cfi-no-fp.ll
@@ -0,0 +1,46 @@
+; RUN: llc -O0 < %s | FileCheck %s
+
+; ModuleID = 'epilogue-cfi-no-fp.c'
+source_filename = "epilogue-cfi-no-fp.c"
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i686-pc-linux"
+
+; Function Attrs: noinline nounwind
+define i32 @foo(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m) {
+; CHECK-LABEL: foo:
+; CHECK: addl $20, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 12
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: .cfi_def_cfa_offset 4
+; CHECK-NEXT: retl
+entry:
+ %i.addr = alloca i32, align 4
+ %j.addr = alloca i32, align 4
+ %k.addr = alloca i32, align 4
+ %l.addr = alloca i32, align 4
+ %m.addr = alloca i32, align 4
+ store i32 %i, i32* %i.addr, align 4
+ store i32 %j, i32* %j.addr, align 4
+ store i32 %k, i32* %k.addr, align 4
+ store i32 %l, i32* %l.addr, align 4
+ store i32 %m, i32* %m.addr, align 4
+ ret i32 0
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (http://llvm.org/git/clang.git 3f8116e6a2815b1d5f3491493938d0c63c9f42c9) (http://llvm.org/git/llvm.git 4fde77f8f1a8e4482e69b6a7484bc7d1b99b3c0a)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "epilogue-cfi-no-fp.c", directory: "epilogue-dwarf/test")
+!2 = !{}
+!3 = !{i32 1, !"NumRegisterParameters", i32 0}
+!4 = !{i32 2, !"Dwarf Version", i32 4}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{i32 1, !"wchar_size", i32 4}
+!7 = !{i32 7, !"PIC Level", i32 2}
+
+
diff --git a/test/CodeGen/X86/f16c-intrinsics.ll b/test/CodeGen/X86/f16c-intrinsics.ll
index 712fe810d2a..64f8fd0ca8d 100644
--- a/test/CodeGen/X86/f16c-intrinsics.ll
+++ b/test/CodeGen/X86/f16c-intrinsics.ll
@@ -1,33 +1,81 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+f16c -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X32-AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X64-AVX512VL
define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) {
; X32-LABEL: test_x86_vcvtph2ps_128:
; X32: # BB#0:
-; X32-NEXT: vcvtph2ps %xmm0, %xmm0
-; X32-NEXT: retl
+; X32-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtph2ps_128:
; X64: # BB#0:
-; X64-NEXT: vcvtph2ps %xmm0, %xmm0
-; X64-NEXT: retq
+; X64-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_128:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly
+define <4 x float> @test_x86_vcvtph2ps_128_m(<8 x i16>* nocapture %a) {
+; X32-LABEL: test_x86_vcvtph2ps_128_m:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_vcvtph2ps_128_m:
+; X64: # BB#0:
+; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_128_m:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128_m:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
+ %load = load <8 x i16>, <8 x i16>* %a
+ %res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %load) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) {
; X32-LABEL: test_x86_vcvtph2ps_256:
; X32: # BB#0:
-; X32-NEXT: vcvtph2ps %xmm0, %ymm0
-; X32-NEXT: retl
+; X32-NEXT: vcvtph2ps %xmm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtph2ps_256:
; X64: # BB#0:
-; X64-NEXT: vcvtph2ps %xmm0, %ymm0
-; X64-NEXT: retq
+; X64-NEXT: vcvtph2ps %xmm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_256:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_256:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -36,15 +84,26 @@ declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly
define <8 x float> @test_x86_vcvtph2ps_256_m(<8 x i16>* nocapture %a) nounwind {
; X32-LABEL: test_x86_vcvtph2ps_256_m:
; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtph2ps (%eax), %ymm0
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtph2ps (%eax), %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtph2ps_256_m:
; X64: # BB#0:
-; X64-NEXT: vcvtph2ps (%rdi), %ymm0
-; X64-NEXT: retq
- %load = load <8 x i16>, <8 x i16>* %a, align 16
+; X64-NEXT: vcvtph2ps (%rdi), %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_256_m:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_256_m:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
+ %load = load <8 x i16>, <8 x i16>* %a
%res = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %load)
ret <8 x float> %res
}
@@ -52,13 +111,23 @@ define <8 x float> @test_x86_vcvtph2ps_256_m(<8 x i16>* nocapture %a) nounwind {
define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0) {
; X32-LABEL: test_x86_vcvtps2ph_128:
; X32: # BB#0:
-; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0
-; X32-NEXT: retl
+; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128:
; X64: # BB#0:
-; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -67,15 +136,27 @@ declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly
define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0) {
; X32-LABEL: test_x86_vcvtps2ph_256:
; X32: # BB#0:
-; X32-NEXT: vcvtps2ph $0, %ymm0, %xmm0
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
+; X32-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00]
+; X32-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_256:
; X64: # BB#0:
-; X64-NEXT: vcvtps2ph $0, %ymm0, %xmm0
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00]
+; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_256:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00]
+; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_256:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00]
+; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -84,14 +165,25 @@ declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
define <4 x float> @test_x86_vcvtps2ph_128_scalar(i64* %ptr) {
; X32-LABEL: test_x86_vcvtps2ph_128_scalar:
; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtph2ps (%eax), %xmm0
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_scalar:
; X64: # BB#0:
-; X64-NEXT: vcvtph2ps (%rdi), %xmm0
-; X64-NEXT: retq
+; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%load = load i64, i64* %ptr
%ins1 = insertelement <2 x i64> undef, i64 %load, i32 0
%ins2 = insertelement <2 x i64> %ins1, i64 0, i32 1
@@ -103,14 +195,25 @@ define <4 x float> @test_x86_vcvtps2ph_128_scalar(i64* %ptr) {
define <4 x float> @test_x86_vcvtps2ph_128_scalar2(i64* %ptr) {
; X32-LABEL: test_x86_vcvtps2ph_128_scalar2:
; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtph2ps (%eax), %xmm0
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_scalar2:
; X64: # BB#0:
-; X64-NEXT: vcvtph2ps (%rdi), %xmm0
-; X64-NEXT: retq
+; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar2:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar2:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%load = load i64, i64* %ptr
%ins = insertelement <2 x i64> undef, i64 %load, i32 0
%bc = bitcast <2 x i64> %ins to <8 x i16>
@@ -121,16 +224,29 @@ define <4 x float> @test_x86_vcvtps2ph_128_scalar2(i64* %ptr) {
define void @test_x86_vcvtps2ph_256_m(<8 x i16>* nocapture %d, <8 x float> %a) nounwind {
; X32-LABEL: test_x86_vcvtps2ph_256_m:
; X32: # BB#0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtps2ph $3, %ymm0, (%eax)
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtps2ph $3, %ymm0, (%eax) # encoding: [0xc4,0xe3,0x7d,0x1d,0x00,0x03]
+; X32-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_256_m:
; X64: # BB#0: # %entry
-; X64-NEXT: vcvtps2ph $3, %ymm0, (%rdi)
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $3, %ymm0, (%rdi) # encoding: [0xc4,0xe3,0x7d,0x1d,0x07,0x03]
+; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_256_m:
+; X32-AVX512VL: # BB#0: # %entry
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0x00,0x03]
+; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_256_m:
+; X64-AVX512VL: # BB#0: # %entry
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0x07,0x03]
+; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a, i32 3)
store <8 x i16> %0, <8 x i16>* %d, align 16
@@ -140,14 +256,31 @@ entry:
define void @test_x86_vcvtps2ph_128_m(<4 x i16>* nocapture %d, <4 x float> %a) nounwind {
; X32-LABEL: test_x86_vcvtps2ph_128_m:
; X32: # BB#0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax)
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_m:
; X64: # BB#0: # %entry
-; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi)
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m:
+; X32-AVX512VL: # BB#0: # %entry
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x03]
+; X32-AVX512VL-NEXT: vpmovzxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0]
+; X32-AVX512VL-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-AVX512VL-NEXT: vpmovdw %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x33,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m:
+; X64-AVX512VL: # BB#0: # %entry
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x03]
+; X64-AVX512VL-NEXT: vpmovzxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0]
+; X64-AVX512VL-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX512VL-NEXT: vpmovdw %xmm0, (%rdi) # encoding: [0x62,0xf2,0x7e,0x08,0x33,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a, i32 3)
%1 = shufflevector <8 x i16> %0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -158,14 +291,25 @@ entry:
define void @test_x86_vcvtps2ph_128_m2(double* nocapture %hf4x16, <4 x float> %f4x32) #0 {
; X32-LABEL: test_x86_vcvtps2ph_128_m2:
; X32: # BB#0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax)
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_m2:
; X64: # BB#0: # %entry
-; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi)
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m2:
+; X32-AVX512VL: # BB#0: # %entry
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m2:
+; X64-AVX512VL: # BB#0: # %entry
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %f4x32, i32 3)
%1 = bitcast <8 x i16> %0 to <2 x double>
@@ -177,14 +321,25 @@ entry:
define void @test_x86_vcvtps2ph_128_m3(i64* nocapture %hf4x16, <4 x float> %f4x32) #0 {
; X32-LABEL: test_x86_vcvtps2ph_128_m3:
; X32: # BB#0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax)
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_m3:
; X64: # BB#0: # %entry
-; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi)
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m3:
+; X32-AVX512VL: # BB#0: # %entry
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m3:
+; X64-AVX512VL: # BB#0: # %entry
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %f4x32, i32 3)
%1 = bitcast <8 x i16> %0 to <2 x i64>
diff --git a/test/CodeGen/X86/fast-isel-int-float-conversion.ll b/test/CodeGen/X86/fast-isel-int-float-conversion.ll
index 3e69710868b..57b50abab53 100644
--- a/test/CodeGen/X86/fast-isel-int-float-conversion.ll
+++ b/test/CodeGen/X86/fast-isel-int-float-conversion.ll
@@ -31,6 +31,7 @@ define double @int_to_double_rr(i32 %a) {
; SSE2_X86-NEXT: fldl (%esp)
; SSE2_X86-NEXT: movl %ebp, %esp
; SSE2_X86-NEXT: popl %ebp
+; SSE2_X86-NEXT: .cfi_def_cfa %esp, 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_double_rr:
@@ -47,6 +48,7 @@ define double @int_to_double_rr(i32 %a) {
; AVX_X86-NEXT: fldl (%esp)
; AVX_X86-NEXT: movl %ebp, %esp
; AVX_X86-NEXT: popl %ebp
+; AVX_X86-NEXT: .cfi_def_cfa %esp, 4
; AVX_X86-NEXT: retl
entry:
%0 = sitofp i32 %a to double
@@ -80,6 +82,7 @@ define double @int_to_double_rm(i32* %a) {
; SSE2_X86-NEXT: fldl (%esp)
; SSE2_X86-NEXT: movl %ebp, %esp
; SSE2_X86-NEXT: popl %ebp
+; SSE2_X86-NEXT: .cfi_def_cfa %esp, 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_double_rm:
@@ -97,6 +100,7 @@ define double @int_to_double_rm(i32* %a) {
; AVX_X86-NEXT: fldl (%esp)
; AVX_X86-NEXT: movl %ebp, %esp
; AVX_X86-NEXT: popl %ebp
+; AVX_X86-NEXT: .cfi_def_cfa %esp, 4
; AVX_X86-NEXT: retl
entry:
%0 = load i32, i32* %a
@@ -130,6 +134,7 @@ define double @int_to_double_rm_optsize(i32* %a) optsize {
; SSE2_X86-NEXT: fldl (%esp)
; SSE2_X86-NEXT: movl %ebp, %esp
; SSE2_X86-NEXT: popl %ebp
+; SSE2_X86-NEXT: .cfi_def_cfa %esp, 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_double_rm_optsize:
@@ -147,6 +152,7 @@ define double @int_to_double_rm_optsize(i32* %a) optsize {
; AVX_X86-NEXT: fldl (%esp)
; AVX_X86-NEXT: movl %ebp, %esp
; AVX_X86-NEXT: popl %ebp
+; AVX_X86-NEXT: .cfi_def_cfa %esp, 4
; AVX_X86-NEXT: retl
entry:
%0 = load i32, i32* %a
@@ -174,6 +180,7 @@ define float @int_to_float_rr(i32 %a) {
; SSE2_X86-NEXT: movss %xmm0, (%esp)
; SSE2_X86-NEXT: flds (%esp)
; SSE2_X86-NEXT: popl %eax
+; SSE2_X86-NEXT: .cfi_def_cfa_offset 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_float_rr:
@@ -184,6 +191,7 @@ define float @int_to_float_rr(i32 %a) {
; AVX_X86-NEXT: vmovss %xmm0, (%esp)
; AVX_X86-NEXT: flds (%esp)
; AVX_X86-NEXT: popl %eax
+; AVX_X86-NEXT: .cfi_def_cfa_offset 4
; AVX_X86-NEXT: retl
entry:
%0 = sitofp i32 %a to float
@@ -211,6 +219,7 @@ define float @int_to_float_rm(i32* %a) {
; SSE2_X86-NEXT: movss %xmm0, (%esp)
; SSE2_X86-NEXT: flds (%esp)
; SSE2_X86-NEXT: popl %eax
+; SSE2_X86-NEXT: .cfi_def_cfa_offset 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_float_rm:
@@ -222,6 +231,7 @@ define float @int_to_float_rm(i32* %a) {
; AVX_X86-NEXT: vmovss %xmm0, (%esp)
; AVX_X86-NEXT: flds (%esp)
; AVX_X86-NEXT: popl %eax
+; AVX_X86-NEXT: .cfi_def_cfa_offset 4
; AVX_X86-NEXT: retl
entry:
%0 = load i32, i32* %a
@@ -249,6 +259,7 @@ define float @int_to_float_rm_optsize(i32* %a) optsize {
; SSE2_X86-NEXT: movss %xmm0, (%esp)
; SSE2_X86-NEXT: flds (%esp)
; SSE2_X86-NEXT: popl %eax
+; SSE2_X86-NEXT: .cfi_def_cfa_offset 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_float_rm_optsize:
@@ -260,6 +271,7 @@ define float @int_to_float_rm_optsize(i32* %a) optsize {
; AVX_X86-NEXT: vmovss %xmm0, (%esp)
; AVX_X86-NEXT: flds (%esp)
; AVX_X86-NEXT: popl %eax
+; AVX_X86-NEXT: .cfi_def_cfa_offset 4
; AVX_X86-NEXT: retl
entry:
%0 = load i32, i32* %a
diff --git a/test/CodeGen/X86/fast-isel-store.ll b/test/CodeGen/X86/fast-isel-store.ll
index e359e620563..e2412e9c5c0 100644
--- a/test/CodeGen/X86/fast-isel-store.ll
+++ b/test/CodeGen/X86/fast-isel-store.ll
@@ -375,6 +375,7 @@ define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double
; SSE64-NEXT: movupd %xmm0, (%eax)
; SSE64-NEXT: movupd %xmm1, 16(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf64:
@@ -413,6 +414,7 @@ define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4
; SSE64-NEXT: movapd %xmm0, (%eax)
; SSE64-NEXT: movapd %xmm1, 16(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf64_aligned:
@@ -452,6 +454,7 @@ define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %va
; SSE64-NEXT: movups %xmm2, 32(%eax)
; SSE64-NEXT: movups %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xi32:
@@ -501,6 +504,7 @@ define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x
; SSE64-NEXT: movaps %xmm2, 32(%eax)
; SSE64-NEXT: movaps %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xi32_aligned:
@@ -550,6 +554,7 @@ define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x floa
; SSE64-NEXT: movups %xmm2, 32(%eax)
; SSE64-NEXT: movups %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xf32:
@@ -599,6 +604,7 @@ define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <1
; SSE64-NEXT: movaps %xmm2, 32(%eax)
; SSE64-NEXT: movaps %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xf32_aligned:
@@ -656,6 +662,7 @@ define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double
; SSE64-NEXT: movupd %xmm2, 32(%eax)
; SSE64-NEXT: movupd %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_8xf64:
@@ -682,6 +689,7 @@ define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double
; AVXONLY64-NEXT: vmovupd %ymm1, 32(%eax)
; AVXONLY64-NEXT: movl %ebp, %esp
; AVXONLY64-NEXT: popl %ebp
+; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_8xf64:
@@ -729,6 +737,7 @@ define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8
; SSE64-NEXT: movapd %xmm2, 32(%eax)
; SSE64-NEXT: movapd %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_8xf64_aligned:
@@ -755,6 +764,7 @@ define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8
; AVXONLY64-NEXT: vmovapd %ymm1, 32(%eax)
; AVXONLY64-NEXT: movl %ebp, %esp
; AVXONLY64-NEXT: popl %ebp
+; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_8xf64_aligned:
diff --git a/test/CodeGen/X86/fma-intrinsics-x86.ll b/test/CodeGen/X86/fma-intrinsics-x86.ll
index 68f39469a82..362864f72a9 100644
--- a/test/CodeGen/X86/fma-intrinsics-x86.ll
+++ b/test/CodeGen/X86/fma-intrinsics-x86.ll
@@ -1,29 +1,32 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
-; RUN: llc < %s -mtriple=x86_64-pc-windows -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX512VL
+; RUN: llc < %s -mtriple=x86_64-pc-windows -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma4,-fma -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
; VFMADD
define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa9,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6a,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -31,21 +34,27 @@ define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4
define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xa9,0xca]
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca]
+; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa9,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_bac_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6a,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
}
@@ -54,20 +63,25 @@ declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float
define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa9,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6b,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -75,21 +89,27 @@ define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1,
define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
+; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
+; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa9,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_bac_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6b,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
}
@@ -98,20 +118,25 @@ declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x do
define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa8,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x68,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -120,20 +145,25 @@ declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float
define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa8,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x69,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -142,20 +172,25 @@ declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x do
define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa8,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x68,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -164,20 +199,25 @@ declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x f
define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa8,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x69,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -187,20 +227,25 @@ declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4
define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xab,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xab,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xab,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6e,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -208,21 +253,27 @@ define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4
define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xab,0xca]
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca]
+; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xab,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_bac_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6e,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
}
@@ -231,20 +282,25 @@ declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float
define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xab,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6f,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -252,21 +308,27 @@ define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1,
define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xab,0xca]
+; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca]
+; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xab,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_bac_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6f,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
}
@@ -275,20 +337,25 @@ declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x do
define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaa,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -297,20 +364,25 @@ declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float
define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaa,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -319,20 +391,25 @@ declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x do
define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xaa,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x6c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -341,20 +418,25 @@ declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x f
define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xaa,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x6d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -364,20 +446,25 @@ declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4
define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xad,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xad,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xad,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7a,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -385,21 +472,27 @@ define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4
define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xad,0xca]
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca]
+; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xad,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_bac_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x7a,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
}
@@ -408,20 +501,25 @@ declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xad,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7b,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -429,21 +527,27 @@ define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1,
define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xad,0xca]
+; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca]
+; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xad,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_bac_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x7b,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
}
@@ -452,20 +556,25 @@ declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x d
define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xac,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xac,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xac,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x78,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -474,20 +583,25 @@ declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xac,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x79,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -496,20 +610,25 @@ declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x d
define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xac,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xac,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xac,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x78,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -518,20 +637,25 @@ declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x
define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xac,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x79,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -541,20 +665,25 @@ declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4
define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaf,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7e,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -562,21 +691,27 @@ define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4
define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xaf,0xca]
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca]
+; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaf,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_bac_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x7e,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
}
@@ -585,20 +720,25 @@ declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaf,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7f,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -606,21 +746,27 @@ define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1,
define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
+; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
+; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaf,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_bac_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x7f,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
}
@@ -629,20 +775,25 @@ declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x d
define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xae,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xae,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xae,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -651,20 +802,25 @@ declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xae,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -673,20 +829,25 @@ declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x d
define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xae,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xae,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xae,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x7c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -695,20 +856,25 @@ declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x
define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xae,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x7d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -718,20 +884,25 @@ declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4
define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa6,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -740,20 +911,25 @@ declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x fl
define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa6,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -762,20 +938,25 @@ declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x
define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa6,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -784,20 +965,25 @@ declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8
define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa6,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -807,20 +993,25 @@ declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>,
define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa7,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5e,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -829,20 +1020,25 @@ declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x fl
define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa7,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5f,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -851,20 +1047,25 @@ declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x
define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa7,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5e,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -873,20 +1074,25 @@ declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8
define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa7,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5f,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
diff --git a/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll b/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll
index ba80c839fdd..ee64790d1d9 100644
--- a/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll
+++ b/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll
@@ -18,11 +18,15 @@ entry:
}
; CHECK-LABEL: noDebug
-; CHECK: addq $24, %rsp
-; CHECK: popq %rbx
-; CHECK-NEXT: popq %r14
-; CHECK-NEXT: retq
-
+; CHECK: addq $16, %rsp
+; CHECK-NEXT: .cfi_adjust_cfa_offset -16
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
define void @withDebug() !dbg !18 {
entry:
@@ -42,9 +46,11 @@ entry:
; CHECK-LABEL: withDebug
; CHECK: callq printf
; CHECK: callq printf
-; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: addq $16, %rsp
; CHECK: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)
diff --git a/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll b/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll
index f9ecf707810..de9d6bf93d6 100644
--- a/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll
+++ b/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll
@@ -9,6 +9,7 @@ define i64 @fn1NoDebug(i64 %a) {
; CHECK-LABEL: fn1NoDebug
; CHECK: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: ret
define i64 @fn1WithDebug(i64 %a) !dbg !4 {
@@ -19,6 +20,7 @@ define i64 @fn1WithDebug(i64 %a) !dbg !4 {
; CHECK-LABEL: fn1WithDebug
; CHECK: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: ret
%struct.Buffer = type { i8, [63 x i8] }
@@ -33,6 +35,7 @@ define void @fn2NoDebug(%struct.Buffer* byval align 64 %p1) {
; CHECK-NOT: sub
; CHECK: mov
; CHECK-NEXT: pop
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
; CHECK-NEXT: ret
define void @fn2WithDebug(%struct.Buffer* byval align 64 %p1) !dbg !8 {
@@ -46,6 +49,7 @@ define void @fn2WithDebug(%struct.Buffer* byval align 64 %p1) !dbg !8 {
; CHECK-NOT: sub
; CHECK: mov
; CHECK-NEXT: pop
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
; CHECK-NEXT: ret
declare i64 @fn(i64, i64)
diff --git a/test/CodeGen/X86/haddsub-2.ll b/test/CodeGen/X86/haddsub-2.ll
index e32c7452b0c..7126fb233e6 100644
--- a/test/CodeGen/X86/haddsub-2.ll
+++ b/test/CodeGen/X86/haddsub-2.ll
@@ -724,11 +724,17 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT: popq %rbx
+; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: popq %r12
+; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: popq %r13
+; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: popq %r14
+; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: popq %r15
+; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: popq %rbp
+; SSE3-NEXT: .cfi_def_cfa_offset 8
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_vphadd_w_test:
@@ -1351,11 +1357,17 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT: popq %rbx
+; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: popq %r12
+; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: popq %r13
+; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: popq %r14
+; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: popq %r15
+; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: popq %rbp
+; SSE3-NEXT: .cfi_def_cfa_offset 8
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_hadd_w:
diff --git a/test/CodeGen/X86/hipe-cc64.ll b/test/CodeGen/X86/hipe-cc64.ll
index efe07cf6301..ce2d0e9c671 100644
--- a/test/CodeGen/X86/hipe-cc64.ll
+++ b/test/CodeGen/X86/hipe-cc64.ll
@@ -87,6 +87,7 @@ define cc 11 { i64, i64, i64 } @tailcaller(i64 %hp, i64 %p) #0 {
; CHECK-NEXT: movl $47, %ecx
; CHECK-NEXT: movl $63, %r8d
; CHECK-NEXT: popq %rax
+ ; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: jmp tailcallee
%ret = tail call cc11 { i64, i64, i64 } @tailcallee(i64 %hp, i64 %p, i64 15,
i64 31, i64 47, i64 63, i64 79) #1
diff --git a/test/CodeGen/X86/horizontal-reduce-smax.ll b/test/CodeGen/X86/horizontal-reduce-smax.ll
new file mode 100644
index 00000000000..8f5aac493b5
--- /dev/null
+++ b/test/CodeGen/X86/horizontal-reduce-smax.ll
@@ -0,0 +1,1896 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+ %2 = icmp sgt <2 x i64> %a0, %1
+ %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+ %4 = extractelement <2 x i64> %3, i32 0
+ ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp sgt <4 x i32> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <4 x i32> %3, %4
+ %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+ %7 = extractelement <4 x i32> %6, i32 0
+ ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v8i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v8i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <8 x i16> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <8 x i16> %3, %4
+ %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+ %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <8 x i16> %6, %7
+ %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+ %10 = extractelement <8 x i16> %9, i32 0
+ ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <16 x i8> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <16 x i8> %3, %4
+ %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+ %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <16 x i8> %6, %7
+ %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+ %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <16 x i8> %9, %10
+ %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+ %13 = extractelement <16 x i8> %12, i32 0
+ ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
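+; The 256-bit tests below follow the same pattern as the 128-bit cases above:
+; each step halves the vector with a shufflevector, keeps the per-lane signed
+; maximum via 'icmp sgt' + 'select', and the scalar result is read out of
+; element 0.
+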
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm6, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movq %xmm2, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp sgt <4 x i64> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+ %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <4 x i64> %3, %4
+ %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+ %7 = extractelement <4 x i64> %6, i32 0
+ ret i64 %7
+}
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <8 x i32> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+ %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <8 x i32> %3, %4
+ %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+ %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <8 x i32> %6, %7
+ %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+ %10 = extractelement <8 x i32> %9, i32 0
+ ret i32 %10
+}
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v16i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v16i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <16 x i16> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+ %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <16 x i16> %3, %4
+ %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+ %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <16 x i16> %6, %7
+ %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+ %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <16 x i16> %9, %10
+ %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+ %13 = extractelement <16 x i16> %12, i32 0
+ ret i16 %13
+}
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <32 x i8> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+ %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <32 x i8> %3, %4
+ %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+ %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <32 x i8> %6, %7
+ %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+ %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <32 x i8> %9, %10
+ %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+ %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp sgt <32 x i8> %12, %13
+ %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+ %16 = extractelement <32 x i8> %15, i32 0
+ ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
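+; For the 512-bit tests the SSE/AVX targets first combine the input registers
+; pairwise (using pmaxs* where available, otherwise pcmpgt* with and/andn/or or
+; blendv selects) before running the usual shuffle-and-max sequence, while
+; AVX512 reduces with vpmaxs* directly on zmm registers.
+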
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: subl $28, %esp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT: pand %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm7, %xmm6
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT: por %xmm2, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm6
+; X86-SSE2-NEXT: pandn %xmm5, %xmm0
+; X86-SSE2-NEXT: por %xmm6, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: addl $28, %esp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE42-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm5
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X86-SSE42-NEXT: movapd %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm6, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm9, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm7, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm6
+; X64-SSE2-NEXT: por %xmm0, %xmm6
+; X64-SSE2-NEXT: pand %xmm8, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm8
+; X64-SSE2-NEXT: por %xmm1, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm8, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm6
+; X64-SSE2-NEXT: pandn %xmm8, %xmm1
+; X64-SSE2-NEXT: por %xmm6, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm5
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X64-SSE42-NEXT: movapd %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <8 x i64> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+ %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <8 x i64> %3, %4
+ %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+ %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <8 x i64> %6, %7
+ %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+ %10 = extractelement <8 x i64> %9, i32 0
+ ret i64 %10
+}
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm5
+; X86-SSE2-NEXT: por %xmm1, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm4
+; X86-SSE2-NEXT: pandn %xmm5, %xmm0
+; X86-SSE2-NEXT: por %xmm4, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxsd %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxsd %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm5
+; X64-SSE2-NEXT: por %xmm1, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm4
+; X64-SSE2-NEXT: pandn %xmm5, %xmm0
+; X64-SSE2-NEXT: por %xmm4, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxsd %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxsd %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <16 x i32> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+ %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <16 x i32> %3, %4
+ %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+ %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <16 x i32> %6, %7
+ %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+ %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <16 x i32> %9, %10
+ %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+ %13 = extractelement <16 x i32> %12, i32 0
+ ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v32i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pmaxsw %xmm3, %xmm1
+; X86-SSE-NEXT: pmaxsw %xmm2, %xmm0
+; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v32i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pmaxsw %xmm3, %xmm1
+; X64-SSE-NEXT: pmaxsw %xmm2, %xmm0
+; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <32 x i16> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+ %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <32 x i16> %3, %4
+ %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+ %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <32 x i16> %6, %7
+ %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+ %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <32 x i16> %9, %10
+ %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+ %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp sgt <32 x i16> %12, %13
+ %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+ %16 = extractelement <32 x i16> %15, i32 0
+ ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm5
+; X86-SSE2-NEXT: por %xmm1, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm4
+; X86-SSE2-NEXT: pandn %xmm5, %xmm0
+; X86-SSE2-NEXT: por %xmm4, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxsb %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X64-SSE2-NEXT: pcmpgtb %xmm3, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm5
+; X64-SSE2-NEXT: por %xmm1, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm4
+; X64-SSE2-NEXT: pandn %xmm5, %xmm0
+; X64-SSE2-NEXT: por %xmm4, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxsb %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <64 x i8> %a0, %1
+ %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+ %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <64 x i8> %3, %4
+ %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+ %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <64 x i8> %6, %7
+ %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+ %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <64 x i8> %9, %10
+ %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+ %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp sgt <64 x i8> %12, %13
+ %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+ %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %17 = icmp sgt <64 x i8> %15, %16
+ %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+ %19 = extractelement <64 x i8> %18, i32 0
+ ret i8 %19
+}
diff --git a/test/CodeGen/X86/horizontal-reduce-smin.ll b/test/CodeGen/X86/horizontal-reduce-smin.ll
new file mode 100644
index 00000000000..6feb963426b
--- /dev/null
+++ b/test/CodeGen/X86/horizontal-reduce-smin.ll
@@ -0,0 +1,1898 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+ %2 = icmp slt <2 x i64> %a0, %1
+ %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+ %4 = extractelement <2 x i64> %3, i32 0
+ ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp slt <4 x i32> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <4 x i32> %3, %4
+ %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+ %7 = extractelement <4 x i32> %6, i32 0
+ ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v8i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v8i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <8 x i16> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <8 x i16> %3, %4
+ %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+ %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <8 x i16> %6, %7
+ %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+ %10 = extractelement <8 x i16> %9, i32 0
+ ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <16 x i8> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <16 x i8> %3, %4
+ %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+ %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <16 x i8> %6, %7
+ %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+ %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <16 x i8> %9, %10
+ %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+ %13 = extractelement <16 x i8> %12, i32 0
+ ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm6, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movq %xmm2, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp slt <4 x i64> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+ %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <4 x i64> %3, %4
+ %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+ %7 = extractelement <4 x i64> %6, i32 0
+ ret i64 %7
+}
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <8 x i32> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+ %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <8 x i32> %3, %4
+ %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+ %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <8 x i32> %6, %7
+ %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+ %10 = extractelement <8 x i32> %9, i32 0
+ ret i32 %10
+}
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v16i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v16i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <16 x i16> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+ %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <16 x i16> %3, %4
+ %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+ %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <16 x i16> %6, %7
+ %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+ %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <16 x i16> %9, %10
+ %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+ %13 = extractelement <16 x i16> %12, i32 0
+ ret i16 %13
+}
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <32 x i8> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+ %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <32 x i8> %3, %4
+ %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+ %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <32 x i8> %6, %7
+ %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+ %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <32 x i8> %9, %10
+ %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+ %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp slt <32 x i8> %12, %13
+ %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+ %16 = extractelement <32 x i8> %15, i32 0
+ ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: subl $28, %esp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT: pand %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm7, %xmm6
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT: por %xmm2, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm6, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: addl $28, %esp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm5
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X86-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT: movapd %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm5
+; X64-SSE2-NEXT: pxor %xmm9, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT: pxor %xmm9, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm9, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm9, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm7, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm6
+; X64-SSE2-NEXT: por %xmm1, %xmm6
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm9, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT: pxor %xmm9, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm6, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm9, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm9
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm9
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm4, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm5
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X64-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movapd %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <8 x i64> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+ %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <8 x i64> %3, %4
+ %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+ %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <8 x i64> %6, %7
+ %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+ %10 = extractelement <8 x i64> %9, i32 0
+ ret i64 %10
+}
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm4
+; X86-SSE2-NEXT: por %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm4, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminsd %xmm3, %xmm1
+; X86-SSE42-NEXT: pminsd %xmm2, %xmm0
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm4
+; X64-SSE2-NEXT: por %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm5
+; X64-SSE2-NEXT: pandn %xmm4, %xmm0
+; X64-SSE2-NEXT: por %xmm5, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminsd %xmm3, %xmm1
+; X64-SSE42-NEXT: pminsd %xmm2, %xmm0
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <16 x i32> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+ %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <16 x i32> %3, %4
+ %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+ %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <16 x i32> %6, %7
+ %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+ %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <16 x i32> %9, %10
+ %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+ %13 = extractelement <16 x i32> %12, i32 0
+ ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v32i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pminsw %xmm3, %xmm1
+; X86-SSE-NEXT: pminsw %xmm2, %xmm0
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v32i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pminsw %xmm3, %xmm1
+; X64-SSE-NEXT: pminsw %xmm2, %xmm0
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <32 x i16> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+ %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <32 x i16> %3, %4
+ %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+ %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <32 x i16> %6, %7
+ %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+ %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <32 x i16> %9, %10
+ %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+ %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp slt <32 x i16> %12, %13
+ %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+ %16 = extractelement <32 x i16> %15, i32 0
+ ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm4
+; X86-SSE2-NEXT: por %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm4, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminsb %xmm3, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm2, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm4
+; X64-SSE2-NEXT: por %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm5
+; X64-SSE2-NEXT: pandn %xmm4, %xmm0
+; X64-SSE2-NEXT: por %xmm5, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminsb %xmm3, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm2, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <64 x i8> %a0, %1
+ %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+ %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <64 x i8> %3, %4
+ %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+ %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <64 x i8> %6, %7
+ %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+ %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <64 x i8> %9, %10
+ %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+ %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp slt <64 x i8> %12, %13
+ %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+ %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %17 = icmp slt <64 x i8> %15, %16
+ %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+ %19 = extractelement <64 x i8> %18, i32 0
+ ret i8 %19
+}
diff --git a/test/CodeGen/X86/horizontal-reduce-umax.ll b/test/CodeGen/X86/horizontal-reduce-umax.ll
new file mode 100644
index 00000000000..ee9d8955cb5
--- /dev/null
+++ b/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -0,0 +1,2203 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm2, %xmm3
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X86-AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm2, %xmm3
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+ %2 = icmp ugt <2 x i64> %a0, %1
+ %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+ %4 = extractelement <2 x i64> %3, i32 0
+ ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm3, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm3, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp ugt <4 x i32> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <4 x i32> %3, %4
+ %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+ %7 = extractelement <4 x i32> %6, i32 0
+ ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm3, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pxor %xmm1, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm1, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm3, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: movd %xmm3, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <8 x i16> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <8 x i16> %3, %4
+ %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+ %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <8 x i16> %6, %7
+ %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+ %10 = extractelement <8 x i16> %9, i32 0
+ ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <16 x i8> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <16 x i8> %3, %4
+ %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+ %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <16 x i8> %6, %7
+ %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+ %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <16 x i8> %9, %10
+ %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+ %13 = extractelement <16 x i8> %12, i32 0
+ ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
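+; Note: the 256-bit tests below split the reduction across two 128-bit halves;
+; the upper half is extracted (vextractf128/vextracti128) and folded into the
+; lower half with the same unsigned-max sequences used by the 128-bit cases
+; (pmaxub/pmaxuw/pmaxud where available, otherwise a sign-bit flip via pxor
+; followed by a signed pcmpgt compare-and-select).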
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE42-NEXT: pxor %xmm3, %xmm4
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm2, %xmm3
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; X86-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm6, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movq %xmm2, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE42-NEXT: pxor %xmm3, %xmm4
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm2, %xmm3
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; X64-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp ugt <4 x i64> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+ %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <4 x i64> %3, %4
+ %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+ %7 = extractelement <4 x i64> %6, i32 0
+ ret i64 %7
+}
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm4, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm4, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movd %xmm3, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <8 x i32> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+ %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <8 x i32> %3, %4
+ %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+ %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <8 x i32> %6, %7
+ %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+ %10 = extractelement <8 x i32> %9, i32 0
+ ret i32 %10
+}
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm4, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm3, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm4, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm3, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <16 x i16> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+ %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <16 x i16> %3, %4
+ %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+ %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <16 x i16> %6, %7
+ %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+ %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <16 x i16> %9, %10
+ %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+ %13 = extractelement <16 x i16> %12, i32 0
+ ret i16 %13
+}
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <32 x i8> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+ %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <32 x i8> %3, %4
+ %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+ %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <32 x i8> %6, %7
+ %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+ %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <32 x i8> %9, %10
+ %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+ %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ugt <32 x i8> %12, %13
+ %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+ %16 = extractelement <32 x i8> %15, i32 0
+ ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
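+; Note: the 512-bit tests below arrive as four xmm registers on SSE, two ymm
+; registers on AVX/AVX2, or a single zmm register on AVX512; the halves are
+; folded together pairwise before the final per-element unsigned-max reduction.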
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: subl $28, %esp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT: pand %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm7, %xmm6
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT: por %xmm2, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm6
+; X86-SSE2-NEXT: pandn %xmm5, %xmm0
+; X86-SSE2-NEXT: por %xmm6, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: addl $28, %esp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm6 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm6, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm5
+; X86-SSE42-NEXT: pxor %xmm6, %xmm5
+; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm5
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm7
+; X86-SSE42-NEXT: pxor %xmm6, %xmm7
+; X86-SSE42-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE42-NEXT: pxor %xmm6, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm7, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X86-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT: movapd %xmm3, %xmm1
+; X86-SSE42-NEXT: xorpd %xmm6, %xmm1
+; X86-SSE42-NEXT: movapd %xmm2, %xmm0
+; X86-SSE42-NEXT: xorpd %xmm6, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm6, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm6
+; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm6, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm9, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm7, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm6
+; X64-SSE2-NEXT: por %xmm0, %xmm6
+; X64-SSE2-NEXT: pand %xmm8, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm8
+; X64-SSE2-NEXT: por %xmm1, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm8, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm6
+; X64-SSE2-NEXT: pandn %xmm8, %xmm1
+; X64-SSE2-NEXT: por %xmm6, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm6, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm5
+; X64-SSE42-NEXT: pxor %xmm6, %xmm5
+; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm5
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm7
+; X64-SSE42-NEXT: pxor %xmm6, %xmm7
+; X64-SSE42-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE42-NEXT: pxor %xmm6, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm7, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X64-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movapd %xmm3, %xmm1
+; X64-SSE42-NEXT: xorpd %xmm6, %xmm1
+; X64-SSE42-NEXT: movapd %xmm2, %xmm0
+; X64-SSE42-NEXT: xorpd %xmm6, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm6, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm6
+; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <8 x i64> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+ %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <8 x i64> %3, %4
+ %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+ %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <8 x i64> %6, %7
+ %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+ %10 = extractelement <8 x i64> %9, i32 0
+ ret i64 %10
+}
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm7
+; X86-SSE2-NEXT: por %xmm0, %xmm7
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm7
+; X86-SSE2-NEXT: pandn %xmm6, %xmm1
+; X86-SSE2-NEXT: por %xmm7, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxud %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxud %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm7
+; X64-SSE2-NEXT: por %xmm0, %xmm7
+; X64-SSE2-NEXT: pand %xmm6, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm6
+; X64-SSE2-NEXT: por %xmm1, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm7
+; X64-SSE2-NEXT: pandn %xmm6, %xmm1
+; X64-SSE2-NEXT: por %xmm7, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxud %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxud %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <16 x i32> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+ %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <16 x i32> %3, %4
+ %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+ %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <16 x i32> %6, %7
+ %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+ %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <16 x i32> %9, %10
+ %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+ %13 = extractelement <16 x i32> %12, i32 0
+ ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: pcmpgtw %xmm5, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtw %xmm5, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm7
+; X86-SSE2-NEXT: por %xmm0, %xmm7
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm7
+; X86-SSE2-NEXT: pandn %xmm6, %xmm1
+; X86-SSE2-NEXT: por %xmm7, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxuw %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: pcmpgtw %xmm5, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtw %xmm5, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm7
+; X64-SSE2-NEXT: por %xmm0, %xmm7
+; X64-SSE2-NEXT: pand %xmm6, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm6
+; X64-SSE2-NEXT: por %xmm1, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm7
+; X64-SSE2-NEXT: pandn %xmm6, %xmm1
+; X64-SSE2-NEXT: por %xmm7, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxuw %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <32 x i16> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+ %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <32 x i16> %3, %4
+ %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+ %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <32 x i16> %6, %7
+ %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+ %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <32 x i16> %9, %10
+ %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+ %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ugt <32 x i16> %12, %13
+ %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+ %16 = extractelement <32 x i16> %15, i32 0
+ ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pmaxub %xmm3, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm2, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxub %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pmaxub %xmm3, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm2, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxub %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <64 x i8> %a0, %1
+ %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+ %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <64 x i8> %3, %4
+ %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+ %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <64 x i8> %6, %7
+ %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+ %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <64 x i8> %9, %10
+ %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+ %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ugt <64 x i8> %12, %13
+ %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+ %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %17 = icmp ugt <64 x i8> %15, %16
+ %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+ %19 = extractelement <64 x i8> %18, i32 0
+ ret i8 %19
+}
diff --git a/test/CodeGen/X86/horizontal-reduce-umin.ll b/test/CodeGen/X86/horizontal-reduce-umin.ll
new file mode 100644
index 00000000000..43369673042
--- /dev/null
+++ b/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -0,0 +1,2207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE42-NEXT: pxor %xmm0, %xmm3
+; X86-SSE42-NEXT: pxor %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X86-AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE42-NEXT: pxor %xmm0, %xmm3
+; X64-SSE42-NEXT: pxor %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+ %2 = icmp ult <2 x i64> %a0, %1
+ %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+ %4 = extractelement <2 x i64> %3, i32 0
+ ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp ult <4 x i32> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <4 x i32> %3, %4
+ %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+ %7 = extractelement <4 x i32> %6, i32 0
+ ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm4, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm3, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pxor %xmm1, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm2
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm4, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm3, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <8 x i16> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <8 x i16> %3, %4
+ %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+ %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <8 x i16> %6, %7
+ %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+ %10 = extractelement <8 x i16> %9, i32 0
+ ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <16 x i8> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <16 x i8> %3, %4
+ %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+ %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <16 x i8> %6, %7
+ %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+ %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <16 x i8> %9, %10
+ %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+ %13 = extractelement <16 x i8> %12, i32 0
+ ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE42-NEXT: pxor %xmm3, %xmm4
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm2, %xmm3
+; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm3
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; X86-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm6, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movq %xmm2, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE42-NEXT: pxor %xmm3, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm2, %xmm3
+; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm3
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; X64-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp ult <4 x i64> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+ %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <4 x i64> %3, %4
+ %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+ %7 = extractelement <4 x i64> %6, i32 0
+ ret i64 %7
+}
+
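+; Note on the pattern exercised by these tests: each horizontal unsigned
+; minimum is written as log2(N) rounds of shufflevector + icmp ult + select,
+; halving the number of live elements per round. As the CHECK lines above
+; show, only the AVX512 run has a native unsigned 64-bit minimum (vpminuq);
+; the other targets flip the sign bit of both operands (pxor with
+; 0x8000000000000000, printed as [9223372036854775808,...] or as split
+; 32-bit halves on the 32-bit runs) so a signed pcmpgtq compare gives the
+; unsigned ordering, then blend with blendvpd. SSE2 additionally lacks
+; pcmpgtq and assembles the 64-bit compare from pcmpgtd/pcmpeqd on the
+; 32-bit halves before selecting with pand/pandn/por.
+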
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm4, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm3, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm4, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm3, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <8 x i32> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+ %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <8 x i32> %3, %4
+ %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+ %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <8 x i32> %6, %7
+ %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+ %10 = extractelement <8 x i32> %9, i32 0
+ ret i32 %10
+}
+
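+; For i32 elements the SSE4.1 pminud instruction computes the unsigned
+; minimum directly, so the SSE42/AVX runs above reduce with simple
+; pmin + pshufd rounds. SSE2 has no unsigned 32-bit min and instead biases
+; both sides by 0x80000000 (the [2147483648,...] constant) so that pcmpgtd
+; can produce the mask used by the pand/pandn/por select.
+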
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm4, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm4
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm4, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm4
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <16 x i16> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+ %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <16 x i16> %3, %4
+ %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+ %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <16 x i16> %6, %7
+ %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+ %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <16 x i16> %9, %10
+ %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+ %13 = extractelement <16 x i16> %12, i32 0
+ ret i16 %13
+}
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <32 x i8> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+ %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <32 x i8> %3, %4
+ %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+ %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <32 x i8> %6, %7
+ %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+ %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <32 x i8> %9, %10
+ %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+ %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ult <32 x i8> %12, %13
+ %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+ %16 = extractelement <32 x i8> %15, i32 0
+ ret i8 %16
+}
+
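+; pminub is available in SSE2 while pminuw (like pminud) requires SSE4.1,
+; which is why the v32i8 SSE2 sequence above is much shorter than the
+; v16i16 one. Also note that pshufd only permutes 32-bit lanes, so once the
+; i16/i8 reductions are down to a single 32-bit lane the remaining elements
+; are brought into position with psrld $16 and (for bytes) psrlw $8 before
+; the final pmin and the scalar move.
+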
+;
+; 512-bit Vectors
+;
+
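+; The 512-bit inputs below arrive as four xmm registers (SSE), two ymm
+; registers (AVX/AVX2), or a single zmm register (AVX512). The wider cases
+; therefore start by folding the vector halves together, either with a pmin
+; of the two halves or with an extract followed by a compare-and-blend, and
+; then fall into the same shuffle-and-min rounds used by the 256-bit tests
+; above.
+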
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: subl $28, %esp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT: pand %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm7, %xmm6
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT: por %xmm2, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm6, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: addl $28, %esp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm5
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: pxor %xmm4, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE42-NEXT: pxor %xmm4, %xmm6
+; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm6
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm7
+; X86-SSE42-NEXT: pxor %xmm4, %xmm7
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm4, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm7, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT: movdqa %xmm6, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2
+; X86-SSE42-NEXT: movapd %xmm2, %xmm1
+; X86-SSE42-NEXT: xorpd %xmm4, %xmm1
+; X86-SSE42-NEXT: movapd %xmm3, %xmm0
+; X86-SSE42-NEXT: xorpd %xmm4, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm4, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm4
+; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm4
+; X86-SSE42-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm5
+; X64-SSE2-NEXT: pxor %xmm9, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT: pxor %xmm9, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm9, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm9, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm7, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm6
+; X64-SSE2-NEXT: por %xmm1, %xmm6
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm9, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT: pxor %xmm9, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm6, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm9, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm9
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm9
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm4, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm5
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: pxor %xmm4, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE42-NEXT: pxor %xmm4, %xmm6
+; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm6
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm7
+; X64-SSE42-NEXT: pxor %xmm4, %xmm7
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm4, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm7, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movdqa %xmm6, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2
+; X64-SSE42-NEXT: movapd %xmm2, %xmm1
+; X64-SSE42-NEXT: xorpd %xmm4, %xmm1
+; X64-SSE42-NEXT: movapd %xmm3, %xmm0
+; X64-SSE42-NEXT: xorpd %xmm4, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm4, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm4
+; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <8 x i64> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+ %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <8 x i64> %3, %4
+ %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+ %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <8 x i64> %6, %7
+ %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+ %10 = extractelement <8 x i64> %9, i32 0
+ ret i64 %10
+}
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm7
+; X86-SSE2-NEXT: por %xmm1, %xmm7
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm5
+; X86-SSE2-NEXT: pandn %xmm7, %xmm1
+; X86-SSE2-NEXT: por %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm4
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: movd %xmm4, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminud %xmm3, %xmm1
+; X86-SSE42-NEXT: pminud %xmm2, %xmm0
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm7
+; X64-SSE2-NEXT: por %xmm1, %xmm7
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm7, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm4
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: movd %xmm4, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminud %xmm3, %xmm1
+; X64-SSE42-NEXT: pminud %xmm2, %xmm0
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <16 x i32> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+ %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <16 x i32> %3, %4
+ %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+ %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <16 x i32> %6, %7
+ %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+ %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <16 x i32> %9, %10
+ %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+ %13 = extractelement <16 x i32> %12, i32 0
+ ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtw %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtw %xmm6, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm7
+; X86-SSE2-NEXT: por %xmm1, %xmm7
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm5
+; X86-SSE2-NEXT: pandn %xmm7, %xmm1
+; X86-SSE2-NEXT: por %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm3, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm4
+; X86-SSE2-NEXT: por %xmm2, %xmm4
+; X86-SSE2-NEXT: movd %xmm4, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminuw %xmm3, %xmm1
+; X86-SSE42-NEXT: pminuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtw %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtw %xmm6, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm7
+; X64-SSE2-NEXT: por %xmm1, %xmm7
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm7, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm3, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm4
+; X64-SSE2-NEXT: por %xmm2, %xmm4
+; X64-SSE2-NEXT: movd %xmm4, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminuw %xmm3, %xmm1
+; X64-SSE42-NEXT: pminuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <32 x i16> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+ %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <32 x i16> %3, %4
+ %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+ %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <32 x i16> %6, %7
+ %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+ %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <32 x i16> %9, %10
+ %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+ %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ult <32 x i16> %12, %13
+ %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+ %16 = extractelement <32 x i16> %15, i32 0
+ ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pminub %xmm3, %xmm1
+; X86-SSE2-NEXT: pminub %xmm2, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminub %xmm3, %xmm1
+; X86-SSE42-NEXT: pminub %xmm2, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pminub %xmm3, %xmm1
+; X64-SSE2-NEXT: pminub %xmm2, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminub %xmm3, %xmm1
+; X64-SSE42-NEXT: pminub %xmm2, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <64 x i8> %a0, %1
+ %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+ %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <64 x i8> %3, %4
+ %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+ %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <64 x i8> %6, %7
+ %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+ %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <64 x i8> %9, %10
+ %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+ %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ult <64 x i8> %12, %13
+ %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+ %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %17 = icmp ult <64 x i8> %15, %16
+ %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+ %19 = extractelement <64 x i8> %18, i32 0
+ ret i8 %19
+}
diff --git a/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/test/CodeGen/X86/illegal-bitfield-loadstore.ll
index fd503aa6c6e..e3b25a539c1 100644
--- a/test/CodeGen/X86/illegal-bitfield-loadstore.ll
+++ b/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -81,6 +81,7 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
; X86-NEXT: orl %edx, %eax
; X86-NEXT: movw %ax, (%ecx)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: i24_insert_bit:
diff --git a/test/CodeGen/X86/imul.ll b/test/CodeGen/X86/imul.ll
index e364b001f94..02782f72108 100644
--- a/test/CodeGen/X86/imul.ll
+++ b/test/CodeGen/X86/imul.ll
@@ -307,6 +307,7 @@ define i64 @test5(i64 %a) {
; X86-NEXT: subl %ecx, %edx
; X86-NEXT: subl %esi, %edx
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%tmp3 = mul i64 %a, -31
@@ -362,6 +363,7 @@ define i64 @test7(i64 %a) {
; X86-NEXT: subl %ecx, %edx
; X86-NEXT: subl %esi, %edx
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%tmp3 = mul i64 %a, -33
@@ -390,6 +392,7 @@ define i64 @testOverflow(i64 %a) {
; X86-NEXT: addl %esi, %edx
; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%tmp3 = mul i64 %a, 9223372036854775807
diff --git a/test/CodeGen/X86/inline-asm-A-constraint.ll b/test/CodeGen/X86/inline-asm-A-constraint.ll
index 2ad011e88e0..7975b318eff 100644
--- a/test/CodeGen/X86/inline-asm-A-constraint.ll
+++ b/test/CodeGen/X86/inline-asm-A-constraint.ll
@@ -19,8 +19,7 @@ entry:
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1
ret { i64, i64 } %.fca.1.insert
}
-; CHECK: lock
-; CHECK-NEXT: cmpxchg16b
+; CHECK: lock cmpxchg16b
attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }
diff --git a/test/CodeGen/X86/lea-opt-cse1.ll b/test/CodeGen/X86/lea-opt-cse1.ll
index 05b47690e81..4c9ec3e0d7a 100644
--- a/test/CodeGen/X86/lea-opt-cse1.ll
+++ b/test/CodeGen/X86/lea-opt-cse1.ll
@@ -30,6 +30,7 @@ define void @test_func(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr {
; X86-NEXT: leal 1(%edx,%ecx), %ecx
; X86-NEXT: movl %ecx, 16(%eax)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0
diff --git a/test/CodeGen/X86/lea-opt-cse2.ll b/test/CodeGen/X86/lea-opt-cse2.ll
index 865dd49a6e1..cee6f6792cb 100644
--- a/test/CodeGen/X86/lea-opt-cse2.ll
+++ b/test/CodeGen/X86/lea-opt-cse2.ll
@@ -46,7 +46,9 @@ define void @foo(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr #0 {
; X86-NEXT: leal 1(%esi,%edx), %ecx
; X86-NEXT: movl %ecx, 16(%eax)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
br label %loop
diff --git a/test/CodeGen/X86/lea-opt-cse3.ll b/test/CodeGen/X86/lea-opt-cse3.ll
index 87949b40d48..ed3aff98036 100644
--- a/test/CodeGen/X86/lea-opt-cse3.ll
+++ b/test/CodeGen/X86/lea-opt-cse3.ll
@@ -91,6 +91,7 @@ define i32 @foo1_mult_basic_blocks(i32 %a, i32 %b) local_unnamed_addr #0 {
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: .LBB2_2: # %exit
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%mul = shl i32 %b, 2
@@ -143,6 +144,7 @@ define i32 @foo1_mult_basic_blocks_illegal_scale(i32 %a, i32 %b) local_unnamed_a
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: .LBB3_2: # %exit
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%mul = shl i32 %b, 1
diff --git a/test/CodeGen/X86/lea-opt-cse4.ll b/test/CodeGen/X86/lea-opt-cse4.ll
index 31f31a73d44..d068180c39c 100644
--- a/test/CodeGen/X86/lea-opt-cse4.ll
+++ b/test/CodeGen/X86/lea-opt-cse4.ll
@@ -36,6 +36,7 @@ define void @foo(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr #0 {
; X86-NEXT: leal 1(%ecx,%edx), %ecx
; X86-NEXT: movl %ecx, 16(%eax)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0
@@ -110,7 +111,9 @@ define void @foo_loop(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr #0
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: movl %edx, 16(%eax)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
br label %loop
diff --git a/test/CodeGen/X86/legalize-shift-64.ll b/test/CodeGen/X86/legalize-shift-64.ll
index ca4cfa5b805..7dff2c20d5a 100644
--- a/test/CodeGen/X86/legalize-shift-64.ll
+++ b/test/CodeGen/X86/legalize-shift-64.ll
@@ -117,9 +117,13 @@ define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) {
; CHECK-NEXT: movl %esi, 4(%eax)
; CHECK-NEXT: movl %edi, (%eax)
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popl %edi
+; CHECK-NEXT: .cfi_def_cfa_offset 12
; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl $4
%shl = shl <2 x i64> %A, %B
ret <2 x i64> %shl
@@ -160,6 +164,7 @@ define i32 @test6() {
; CHECK-NEXT: .LBB5_4: # %if.then
; CHECK-NEXT: movl %ebp, %esp
; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: .cfi_def_cfa %esp, 4
; CHECK-NEXT: retl
%x = alloca i32, align 4
%t = alloca i64, align 8
diff --git a/test/CodeGen/X86/live-out-reg-info.ll b/test/CodeGen/X86/live-out-reg-info.ll
index b838065beea..170f73593f6 100644
--- a/test/CodeGen/X86/live-out-reg-info.ll
+++ b/test/CodeGen/X86/live-out-reg-info.ll
@@ -18,6 +18,7 @@ define void @foo(i32 %a) {
; CHECK-NEXT: callq qux
; CHECK-NEXT: .LBB0_2: # %false
; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%t0 = lshr i32 %a, 23
br label %next
diff --git a/test/CodeGen/X86/load-combine.ll b/test/CodeGen/X86/load-combine.ll
index d1f5f41ac7b..d46efc4b5ec 100644
--- a/test/CodeGen/X86/load-combine.ll
+++ b/test/CodeGen/X86/load-combine.ll
@@ -376,6 +376,7 @@ define i32 @load_i32_by_i8_bswap_uses(i32* %arg) {
; CHECK-NEXT: orl %ecx, %eax
; CHECK-NEXT: orl %edx, %eax
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_bswap_uses:
@@ -496,6 +497,7 @@ define i32 @load_i32_by_i8_bswap_store_in_between(i32* %arg, i32* %arg1) {
; CHECK-NEXT: movzbl 3(%ecx), %eax
; CHECK-NEXT: orl %edx, %eax
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_bswap_store_in_between:
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll
index 8983c3acb53..207175aae1a 100644
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -1057,9 +1057,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
; SKX: # BB#0:
; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0
-; SKX-NEXT: kshiftlb $6, %k0, %k0
-; SKX-NEXT: kshiftrb $6, %k0, %k1
+; SKX-NEXT: vptestmq %xmm2, %xmm2, %k1
; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -1068,9 +1066,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
; SKX_32: # BB#0:
; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0
-; SKX_32-NEXT: kshiftlb $6, %k0, %k0
-; SKX_32-NEXT: kshiftrb $6, %k0, %k1
+; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1
; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
@@ -1105,9 +1101,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
; SKX: # BB#0:
; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0
-; SKX-NEXT: kshiftlb $6, %k0, %k0
-; SKX-NEXT: kshiftrb $6, %k0, %k1
+; SKX-NEXT: vptestmq %xmm2, %xmm2, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: vzeroupper
@@ -1117,9 +1111,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
; SKX_32: # BB#0:
; SKX_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0
-; SKX_32-NEXT: kshiftlb $6, %k0, %k0
-; SKX_32-NEXT: kshiftrb $6, %k0, %k1
+; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX_32-NEXT: vzeroupper
@@ -1165,9 +1157,7 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
; SKX: # BB#0:
; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vptestmq %xmm1, %xmm1, %k0
-; SKX-NEXT: kshiftlb $6, %k0, %k0
-; SKX-NEXT: kshiftrb $6, %k0, %k1
+; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
@@ -1176,9 +1166,7 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
; SKX_32: # BB#0:
; SKX_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k0
-; SKX_32-NEXT: kshiftlb $6, %k0, %k0
-; SKX_32-NEXT: kshiftrb $6, %k0, %k1
+; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
; SKX_32-NEXT: vmovaps %xmm2, %xmm0
@@ -1702,6 +1690,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: .cfi_def_cfa %esp, 4
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16i64:
@@ -1736,6 +1725,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; SKX_32-NEXT: retl
%res = call <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
ret <16 x i64> %res
@@ -1819,6 +1809,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; KNL_32-NEXT: vmovapd %zmm2, %zmm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: .cfi_def_cfa %esp, 4
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16f64:
@@ -1853,6 +1844,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; SKX_32-NEXT: vmovapd %zmm2, %zmm0
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; SKX_32-NEXT: retl
%res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
ret <16 x double> %res
@@ -1934,6 +1926,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: .cfi_def_cfa %esp, 4
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
@@ -1967,6 +1960,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
@@ -2050,6 +2044,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: .cfi_def_cfa %esp, 4
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
@@ -2083,6 +2078,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
@@ -2127,6 +2123,7 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: .cfi_def_cfa %esp, 4
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_pr28312:
@@ -2154,6 +2151,7 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; SKX_32-NEXT: retl
%g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
%g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll
index 3e257f5fd85..f43e3f6f56e 100644
--- a/test/CodeGen/X86/masked_memop.ll
+++ b/test/CodeGen/X86/masked_memop.ll
@@ -285,9 +285,7 @@ define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
; AVX512F-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftlw $8, %k0, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
@@ -327,9 +325,7 @@ define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
; AVX512F: ## BB#0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftlw $8, %k0, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
@@ -369,9 +365,7 @@ define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
; AVX512F: ## BB#0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftlw $8, %k0, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
diff --git a/test/CodeGen/X86/memcmp-optsize.ll b/test/CodeGen/X86/memcmp-optsize.ll
index 77d9fa69182..3f5eeba7055 100644
--- a/test/CodeGen/X86/memcmp-optsize.ll
+++ b/test/CodeGen/X86/memcmp-optsize.ll
@@ -156,36 +156,36 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize {
define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length3_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl (%eax), %edx
-; X86-NEXT: cmpw (%ecx), %dx
-; X86-NEXT: jne .LBB5_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 2(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 2(%ecx), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl (%ecx), %edx
+; X86-NEXT: cmpw (%eax), %dx
+; X86-NEXT: jne .LBB5_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movb 2(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 2(%eax), %dl
; X86-NEXT: je .LBB5_3
-; X86-NEXT: .LBB5_1: # %res_block
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: incl %eax
+; X86-NEXT: .LBB5_2: # %res_block
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: incl %ecx
; X86-NEXT: .LBB5_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length3_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: cmpw (%rsi), %ax
-; X64-NEXT: jne .LBB5_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB5_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movb 2(%rdi), %cl
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpb 2(%rsi), %cl
; X64-NEXT: je .LBB5_3
-; X64-NEXT: .LBB5_1: # %res_block
+; X64-NEXT: .LBB5_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB5_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -314,36 +314,36 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize {
define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length5_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB10_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 4(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 4(%ecx), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB10_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movb 4(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 4(%eax), %dl
; X86-NEXT: je .LBB10_3
-; X86-NEXT: .LBB10_1: # %res_block
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: incl %eax
+; X86-NEXT: .LBB10_2: # %res_block
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: incl %ecx
; X86-NEXT: .LBB10_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length5_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: cmpl (%rsi), %eax
-; X64-NEXT: jne .LBB10_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB10_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movb 4(%rdi), %cl
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpb 4(%rsi), %cl
; X64-NEXT: je .LBB10_3
-; X64-NEXT: .LBB10_1: # %res_block
+; X64-NEXT: .LBB10_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB10_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -356,7 +356,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize {
define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length8:
-; X86: # BB#0: # %loadbb
+; X86: # BB#0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -365,8 +365,8 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: jne .LBB11_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB11_2
+; X86-NEXT: # BB#1: # %loadbb1
; X86-NEXT: movl 4(%esi), %ecx
; X86-NEXT: movl 4(%eax), %edx
; X86-NEXT: bswapl %ecx
@@ -374,7 +374,7 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: je .LBB11_3
-; X86-NEXT: .LBB11_1: # %res_block
+; X86-NEXT: .LBB11_2: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: setae %al
@@ -400,22 +400,22 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length8_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB12_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movl 4(%eax), %edx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpl 4(%ecx), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB12_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movl 4(%ecx), %edx
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpl 4(%eax), %edx
; X86-NEXT: je .LBB12_3
-; X86-NEXT: .LBB12_1: # %res_block
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: incl %eax
+; X86-NEXT: .LBB12_2: # %res_block
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: incl %ecx
; X86-NEXT: .LBB12_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: sete %al
; X86-NEXT: retl
;
@@ -432,15 +432,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize {
define i1 @length8_eq_const(i8* %X) nounwind optsize {
; X86-LABEL: length8_eq_const:
-; X86: # BB#0: # %loadbb
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130
-; X86-NEXT: jne .LBB13_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB13_2
+; X86-NEXT: # BB#1: # %loadbb1
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534
; X86-NEXT: je .LBB13_3
-; X86-NEXT: .LBB13_1: # %res_block
+; X86-NEXT: .LBB13_2: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: incl %eax
; X86-NEXT: .LBB13_3: # %endblock
@@ -473,16 +473,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length12_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: cmpq (%rsi), %rax
-; X64-NEXT: jne .LBB14_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB14_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpl 8(%rsi), %ecx
; X64-NEXT: je .LBB14_3
-; X64-NEXT: .LBB14_1: # %res_block
+; X64-NEXT: .LBB14_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB14_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -505,28 +505,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length12:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB15_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB15_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: movl 8(%rsi), %edx
; X64-NEXT: bswapl %ecx
; X64-NEXT: bswapl %edx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB15_1
-; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
-; X64-NEXT: .LBB15_1: # %res_block
+; X64-NEXT: je .LBB15_3
+; X64-NEXT: .LBB15_2: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
+; X64-NEXT: .LBB15_3: # %endblock
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
ret i32 %m
@@ -546,28 +545,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length16:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB16_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB16_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rcx
; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB16_1
-; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
-; X64-NEXT: .LBB16_1: # %res_block
+; X64-NEXT: je .LBB16_3
+; X64-NEXT: .LBB16_2: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
+; X64-NEXT: .LBB16_3: # %endblock
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
ret i32 %m
@@ -701,19 +699,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize {
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB20_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB20_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movq 16(%rdi), %rcx
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
; X64-SSE2-NEXT: je .LBB20_3
-; X64-SSE2-NEXT: .LBB20_1: # %res_block
+; X64-SSE2-NEXT: .LBB20_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB20_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -721,18 +719,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize {
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length24_eq:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX2-NEXT: jne .LBB20_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB20_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: movq 16(%rdi), %rcx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx
; X64-AVX2-NEXT: je .LBB20_3
-; X64-AVX2-NEXT: .LBB20_1: # %res_block
+; X64-AVX2-NEXT: .LBB20_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB20_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
@@ -757,18 +755,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize {
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq_const:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB21_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB21_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
; X64-SSE2-NEXT: je .LBB21_3
-; X64-SSE2-NEXT: .LBB21_1: # %res_block
+; X64-SSE2-NEXT: .LBB21_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB21_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -776,18 +774,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize {
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length24_eq_const:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX2-NEXT: jne .LBB21_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB21_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
; X64-AVX2-NEXT: cmpq %rcx, 16(%rdi)
; X64-AVX2-NEXT: je .LBB21_3
-; X64-AVX2-NEXT: .LBB21_1: # %res_block
+; X64-AVX2-NEXT: .LBB21_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB21_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
@@ -833,7 +831,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
; X86-NOSSE-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq:
-; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2: # BB#0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
@@ -841,8 +839,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
-; X86-SSE2-NEXT: jne .LBB23_1
-; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: jne .LBB23_2
+; X86-SSE2-NEXT: # BB#1: # %loadbb1
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -850,7 +848,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X86-SSE2-NEXT: je .LBB23_3
-; X86-SSE2-NEXT: .LBB23_1: # %res_block
+; X86-SSE2-NEXT: .LBB23_2: # %res_block
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: incl %eax
; X86-SSE2-NEXT: .LBB23_3: # %endblock
@@ -859,14 +857,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB23_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB23_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -874,7 +872,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-SSE2-NEXT: je .LBB23_3
-; X64-SSE2-NEXT: .LBB23_1: # %res_block
+; X64-SSE2-NEXT: .LBB23_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB23_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -909,21 +907,21 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize {
; X86-NOSSE-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq_const:
-; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2: # BB#0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
-; X86-SSE2-NEXT: jne .LBB24_1
-; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: jne .LBB24_2
+; X86-SSE2-NEXT: # BB#1: # %loadbb1
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X86-SSE2-NEXT: je .LBB24_3
-; X86-SSE2-NEXT: .LBB24_1: # %res_block
+; X86-SSE2-NEXT: .LBB24_2: # %res_block
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: incl %eax
; X86-SSE2-NEXT: .LBB24_3: # %endblock
@@ -932,20 +930,20 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize {
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB24_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB24_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-SSE2-NEXT: je .LBB24_3
-; X64-SSE2-NEXT: .LBB24_1: # %res_block
+; X64-SSE2-NEXT: .LBB24_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB24_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -1009,20 +1007,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize {
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length64_eq:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
; X64-AVX2-NEXT: cmpl $-1, %eax
-; X64-AVX2-NEXT: jne .LBB26_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB26_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: cmpl $-1, %ecx
; X64-AVX2-NEXT: je .LBB26_3
-; X64-AVX2-NEXT: .LBB26_1: # %res_block
+; X64-AVX2-NEXT: .LBB26_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB26_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
@@ -1059,20 +1057,20 @@ define i1 @length64_eq_const(i8* %X) nounwind optsize {
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length64_eq_const:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
; X64-AVX2-NEXT: cmpl $-1, %eax
-; X64-AVX2-NEXT: jne .LBB27_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB27_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: cmpl $-1, %ecx
; X64-AVX2-NEXT: je .LBB27_3
-; X64-AVX2-NEXT: .LBB27_1: # %res_block
+; X64-AVX2-NEXT: .LBB27_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB27_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index 393e4c42d8b..84fd45b0a08 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -187,35 +187,35 @@ define i32 @length3(i8* %X, i8* %Y) nounwind {
define i1 @length3_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length3_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl (%eax), %edx
-; X86-NEXT: cmpw (%ecx), %dx
-; X86-NEXT: jne .LBB7_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 2(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 2(%ecx), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl (%ecx), %edx
+; X86-NEXT: cmpw (%eax), %dx
+; X86-NEXT: jne .LBB7_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movb 2(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 2(%eax), %dl
; X86-NEXT: je .LBB7_3
-; X86-NEXT: .LBB7_1: # %res_block
-; X86-NEXT: movl $1, %eax
+; X86-NEXT: .LBB7_2: # %res_block
+; X86-NEXT: movl $1, %ecx
; X86-NEXT: .LBB7_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length3_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: cmpw (%rsi), %ax
-; X64-NEXT: jne .LBB7_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB7_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movb 2(%rdi), %cl
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpb 2(%rsi), %cl
; X64-NEXT: je .LBB7_3
-; X64-NEXT: .LBB7_1: # %res_block
+; X64-NEXT: .LBB7_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB7_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -344,35 +344,35 @@ define i32 @length5(i8* %X, i8* %Y) nounwind {
define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length5_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB12_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 4(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 4(%ecx), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB12_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movb 4(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 4(%eax), %dl
; X86-NEXT: je .LBB12_3
-; X86-NEXT: .LBB12_1: # %res_block
-; X86-NEXT: movl $1, %eax
+; X86-NEXT: .LBB12_2: # %res_block
+; X86-NEXT: movl $1, %ecx
; X86-NEXT: .LBB12_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length5_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: cmpl (%rsi), %eax
-; X64-NEXT: jne .LBB12_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB12_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movb 4(%rdi), %cl
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpb 4(%rsi), %cl
; X64-NEXT: je .LBB12_3
-; X64-NEXT: .LBB12_1: # %res_block
+; X64-NEXT: .LBB12_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB12_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -385,7 +385,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
define i32 @length8(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length8:
-; X86: # BB#0: # %loadbb
+; X86: # BB#0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -394,23 +394,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: jne .LBB13_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB13_2
+; X86-NEXT: # BB#1: # %loadbb1
; X86-NEXT: movl 4(%esi), %ecx
; X86-NEXT: movl 4(%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: jne .LBB13_1
-; X86-NEXT: # BB#3: # %endblock
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-; X86-NEXT: .LBB13_1: # %res_block
+; X86-NEXT: je .LBB13_3
+; X86-NEXT: .LBB13_2: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: setae %al
; X86-NEXT: leal -1(%eax,%eax), %eax
+; X86-NEXT: .LBB13_3: # %endblock
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
@@ -431,21 +429,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
define i1 @length8_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length8_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB14_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movl 4(%eax), %edx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpl 4(%ecx), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB14_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movl 4(%ecx), %edx
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpl 4(%eax), %edx
; X86-NEXT: je .LBB14_3
-; X86-NEXT: .LBB14_1: # %res_block
-; X86-NEXT: movl $1, %eax
+; X86-NEXT: .LBB14_2: # %res_block
+; X86-NEXT: movl $1, %ecx
; X86-NEXT: .LBB14_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: sete %al
; X86-NEXT: retl
;
@@ -462,15 +460,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind {
define i1 @length8_eq_const(i8* %X) nounwind {
; X86-LABEL: length8_eq_const:
-; X86: # BB#0: # %loadbb
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130
-; X86-NEXT: jne .LBB15_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB15_2
+; X86-NEXT: # BB#1: # %loadbb1
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534
; X86-NEXT: je .LBB15_3
-; X86-NEXT: .LBB15_1: # %res_block
+; X86-NEXT: .LBB15_2: # %res_block
; X86-NEXT: movl $1, %eax
; X86-NEXT: .LBB15_3: # %endblock
; X86-NEXT: testl %eax, %eax
@@ -502,16 +500,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length12_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: cmpq (%rsi), %rax
-; X64-NEXT: jne .LBB16_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB16_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpl 8(%rsi), %ecx
; X64-NEXT: je .LBB16_3
-; X64-NEXT: .LBB16_1: # %res_block
+; X64-NEXT: .LBB16_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB16_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -534,28 +532,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length12:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB17_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB17_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: movl 8(%rsi), %edx
; X64-NEXT: bswapl %ecx
; X64-NEXT: bswapl %edx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB17_1
-; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
-; X64-NEXT: .LBB17_1: # %res_block
+; X64-NEXT: je .LBB17_3
+; X64-NEXT: .LBB17_2: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
+; X64-NEXT: .LBB17_3: # %endblock
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
ret i32 %m
@@ -575,28 +572,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length16:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB18_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB18_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rcx
; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB18_1
-; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
-; X64-NEXT: .LBB18_1: # %res_block
+; X64-NEXT: je .LBB18_3
+; X64-NEXT: .LBB18_2: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
+; X64-NEXT: .LBB18_3: # %endblock
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
ret i32 %m
@@ -754,19 +750,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind {
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB22_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB22_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movq 16(%rdi), %rcx
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
; X64-SSE2-NEXT: je .LBB22_3
-; X64-SSE2-NEXT: .LBB22_1: # %res_block
+; X64-SSE2-NEXT: .LBB22_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB22_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -774,18 +770,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind {
; X64-SSE2-NEXT: retq
;
; X64-AVX-LABEL: length24_eq:
-; X64-AVX: # BB#0: # %loadbb
+; X64-AVX: # BB#0:
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX-NEXT: jne .LBB22_1
-; X64-AVX-NEXT: # BB#2: # %loadbb1
+; X64-AVX-NEXT: jne .LBB22_2
+; X64-AVX-NEXT: # BB#1: # %loadbb1
; X64-AVX-NEXT: movq 16(%rdi), %rcx
; X64-AVX-NEXT: xorl %eax, %eax
; X64-AVX-NEXT: cmpq 16(%rsi), %rcx
; X64-AVX-NEXT: je .LBB22_3
-; X64-AVX-NEXT: .LBB22_1: # %res_block
+; X64-AVX-NEXT: .LBB22_2: # %res_block
; X64-AVX-NEXT: movl $1, %eax
; X64-AVX-NEXT: .LBB22_3: # %endblock
; X64-AVX-NEXT: testl %eax, %eax
@@ -810,18 +806,18 @@ define i1 @length24_eq_const(i8* %X) nounwind {
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq_const:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB23_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB23_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
; X64-SSE2-NEXT: je .LBB23_3
-; X64-SSE2-NEXT: .LBB23_1: # %res_block
+; X64-SSE2-NEXT: .LBB23_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB23_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -829,18 +825,18 @@ define i1 @length24_eq_const(i8* %X) nounwind {
; X64-SSE2-NEXT: retq
;
; X64-AVX-LABEL: length24_eq_const:
-; X64-AVX: # BB#0: # %loadbb
+; X64-AVX: # BB#0:
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX-NEXT: jne .LBB23_1
-; X64-AVX-NEXT: # BB#2: # %loadbb1
+; X64-AVX-NEXT: jne .LBB23_2
+; X64-AVX-NEXT: # BB#1: # %loadbb1
; X64-AVX-NEXT: xorl %eax, %eax
; X64-AVX-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
; X64-AVX-NEXT: cmpq %rcx, 16(%rdi)
; X64-AVX-NEXT: je .LBB23_3
-; X64-AVX-NEXT: .LBB23_1: # %res_block
+; X64-AVX-NEXT: .LBB23_2: # %res_block
; X64-AVX-NEXT: movl $1, %eax
; X64-AVX-NEXT: .LBB23_3: # %endblock
; X64-AVX-NEXT: testl %eax, %eax
@@ -898,7 +894,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq:
-; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2: # BB#0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
@@ -906,8 +902,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
-; X86-SSE2-NEXT: jne .LBB25_1
-; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: jne .LBB25_2
+; X86-SSE2-NEXT: # BB#1: # %loadbb1
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -915,7 +911,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X86-SSE2-NEXT: je .LBB25_3
-; X86-SSE2-NEXT: .LBB25_1: # %res_block
+; X86-SSE2-NEXT: .LBB25_2: # %res_block
; X86-SSE2-NEXT: movl $1, %eax
; X86-SSE2-NEXT: .LBB25_3: # %endblock
; X86-SSE2-NEXT: testl %eax, %eax
@@ -923,14 +919,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB25_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB25_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -938,7 +934,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-SSE2-NEXT: je .LBB25_3
-; X64-SSE2-NEXT: .LBB25_1: # %res_block
+; X64-SSE2-NEXT: .LBB25_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB25_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -946,20 +942,20 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X64-SSE2-NEXT: retq
;
; X64-AVX1-LABEL: length32_eq:
-; X64-AVX1: # BB#0: # %loadbb
+; X64-AVX1: # BB#0:
; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX1-NEXT: jne .LBB25_1
-; X64-AVX1-NEXT: # BB#2: # %loadbb1
+; X64-AVX1-NEXT: jne .LBB25_2
+; X64-AVX1-NEXT: # BB#1: # %loadbb1
; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0
; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm0, %xmm0
; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx
; X64-AVX1-NEXT: xorl %eax, %eax
; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-AVX1-NEXT: je .LBB25_3
-; X64-AVX1-NEXT: .LBB25_1: # %res_block
+; X64-AVX1-NEXT: .LBB25_2: # %res_block
; X64-AVX1-NEXT: movl $1, %eax
; X64-AVX1-NEXT: .LBB25_3: # %endblock
; X64-AVX1-NEXT: testl %eax, %eax
@@ -1006,21 +1002,21 @@ define i1 @length32_eq_const(i8* %X) nounwind {
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq_const:
-; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2: # BB#0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
-; X86-SSE2-NEXT: jne .LBB26_1
-; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: jne .LBB26_2
+; X86-SSE2-NEXT: # BB#1: # %loadbb1
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X86-SSE2-NEXT: je .LBB26_3
-; X86-SSE2-NEXT: .LBB26_1: # %res_block
+; X86-SSE2-NEXT: .LBB26_2: # %res_block
; X86-SSE2-NEXT: movl $1, %eax
; X86-SSE2-NEXT: .LBB26_3: # %endblock
; X86-SSE2-NEXT: testl %eax, %eax
@@ -1028,20 +1024,20 @@ define i1 @length32_eq_const(i8* %X) nounwind {
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB26_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB26_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-SSE2-NEXT: je .LBB26_3
-; X64-SSE2-NEXT: .LBB26_1: # %res_block
+; X64-SSE2-NEXT: .LBB26_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB26_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -1049,20 +1045,20 @@ define i1 @length32_eq_const(i8* %X) nounwind {
; X64-SSE2-NEXT: retq
;
; X64-AVX1-LABEL: length32_eq_const:
-; X64-AVX1: # BB#0: # %loadbb
+; X64-AVX1: # BB#0:
; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX1-NEXT: jne .LBB26_1
-; X64-AVX1-NEXT: # BB#2: # %loadbb1
+; X64-AVX1-NEXT: jne .LBB26_2
+; X64-AVX1-NEXT: # BB#1: # %loadbb1
; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0
; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx
; X64-AVX1-NEXT: xorl %eax, %eax
; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-AVX1-NEXT: je .LBB26_3
-; X64-AVX1-NEXT: .LBB26_1: # %res_block
+; X64-AVX1-NEXT: .LBB26_2: # %res_block
; X64-AVX1-NEXT: movl $1, %eax
; X64-AVX1-NEXT: .LBB26_3: # %endblock
; X64-AVX1-NEXT: testl %eax, %eax
@@ -1136,20 +1132,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind {
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: length64_eq:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
; X64-AVX2-NEXT: cmpl $-1, %eax
-; X64-AVX2-NEXT: jne .LBB28_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB28_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: cmpl $-1, %ecx
; X64-AVX2-NEXT: je .LBB28_3
-; X64-AVX2-NEXT: .LBB28_1: # %res_block
+; X64-AVX2-NEXT: .LBB28_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB28_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
@@ -1197,20 +1193,20 @@ define i1 @length64_eq_const(i8* %X) nounwind {
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: length64_eq_const:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
; X64-AVX2-NEXT: cmpl $-1, %eax
-; X64-AVX2-NEXT: jne .LBB29_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB29_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: cmpl $-1, %ecx
; X64-AVX2-NEXT: je .LBB29_3
-; X64-AVX2-NEXT: .LBB29_1: # %res_block
+; X64-AVX2-NEXT: .LBB29_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB29_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
diff --git a/test/CodeGen/X86/memset-nonzero.ll b/test/CodeGen/X86/memset-nonzero.ll
index f0a957c9417..98e09377ddb 100644
--- a/test/CodeGen/X86/memset-nonzero.ll
+++ b/test/CodeGen/X86/memset-nonzero.ll
@@ -148,6 +148,7 @@ define void @memset_256_nonzero_bytes(i8* %x) {
; SSE-NEXT: movl $256, %edx # imm = 0x100
; SSE-NEXT: callq memset
; SSE-NEXT: popq %rax
+; SSE-NEXT: .cfi_def_cfa_offset 8
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_256_nonzero_bytes:
diff --git a/test/CodeGen/X86/merge-consecutive-loads-128.ll b/test/CodeGen/X86/merge-consecutive-loads-128.ll
index e414f5554de..b909b7c403b 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -72,7 +72,9 @@ define <2 x i64> @merge_2i64_i64_12(i64* %ptr) nounwind uwtable noinline ssp {
; X32-SSE1-NEXT: movl %esi, 4(%eax)
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_2i64_i64_12:
@@ -384,6 +386,7 @@ define <4 x i32> @merge_4i32_i32_23u5(i32* %ptr) nounwind uwtable noinline ssp {
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: movl %ecx, 12(%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_23u5:
@@ -435,7 +438,9 @@ define <4 x i32> @merge_4i32_i32_23u5_inc2(i32* %ptr) nounwind uwtable noinline
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: movl %ecx, 12(%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc2:
@@ -490,7 +495,9 @@ define <4 x i32> @merge_4i32_i32_23u5_inc3(i32* %ptr) nounwind uwtable noinline
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: movl %ecx, 12(%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc3:
@@ -649,7 +656,9 @@ define <4 x i32> @merge_4i32_i32_45zz_inc4(i32* %ptr) nounwind uwtable noinline
; X32-SSE1-NEXT: movl $0, 12(%eax)
; X32-SSE1-NEXT: movl $0, 8(%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc4:
@@ -701,7 +710,9 @@ define <4 x i32> @merge_4i32_i32_45zz_inc5(i32* %ptr) nounwind uwtable noinline
; X32-SSE1-NEXT: movl $0, 12(%eax)
; X32-SSE1-NEXT: movl $0, 8(%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc5:
@@ -751,7 +762,9 @@ define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline s
; X32-SSE1-NEXT: movl %esi, 6(%eax)
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_8i16_i16_23u567u9:
@@ -897,9 +910,13 @@ define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(i8* %ptr) nounwind uwtable noin
; X32-SSE1-NEXT: movl %esi, 3(%eax)
; X32-SSE1-NEXT: movw %bp, (%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 16
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
; X32-SSE1-NEXT: popl %ebx
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %ebp
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF:
@@ -1129,7 +1146,9 @@ define <2 x i64> @merge_2i64_i64_12_volatile(i64* %ptr) nounwind uwtable noinlin
; X32-SSE1-NEXT: movl %esi, 4(%eax)
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_2i64_i64_12_volatile:
diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll
index 051c8a710c8..ddcc383b65e 100644
--- a/test/CodeGen/X86/movtopush.ll
+++ b/test/CodeGen/X86/movtopush.ll
@@ -382,8 +382,10 @@ entry:
; LINUX: pushl $1
; LINUX: .cfi_adjust_cfa_offset 4
; LINUX: calll good
-; LINUX: addl $28, %esp
+; LINUX: addl $16, %esp
; LINUX: .cfi_adjust_cfa_offset -16
+; LINUX: addl $12, %esp
+; LINUX: .cfi_def_cfa_offset 4
; LINUX-NOT: add
; LINUX: retl
define void @pr27140() optsize {
diff --git a/test/CodeGen/X86/mul-constant-result.ll b/test/CodeGen/X86/mul-constant-result.ll
index 011b63ce726..f778397f889 100644
--- a/test/CodeGen/X86/mul-constant-result.ll
+++ b/test/CodeGen/X86/mul-constant-result.ll
@@ -34,84 +34,116 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: .LBB0_6:
; X86-NEXT: addl %eax, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_39:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB0_40:
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_7:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_8:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: shll $2, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_9:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_10:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_11:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (,%eax,8), %ecx
; X86-NEXT: jmp .LBB0_12
; X86-NEXT: .LBB0_13:
; X86-NEXT: shll $3, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_14:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_15:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_16:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_17:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: shll $2, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_18:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_19:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: jmp .LBB0_20
; X86-NEXT: .LBB0_21:
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_22:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: shll $4, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_23:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: shll $4, %ecx
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_24:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_25:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: shll $2, %ecx
; X86-NEXT: jmp .LBB0_12
@@ -119,20 +151,26 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: shll $2, %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_27:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_28:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: .LBB0_20:
; X86-NEXT: leal (%eax,%ecx,4), %ecx
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_29:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: shll $3, %ecx
; X86-NEXT: jmp .LBB0_12
@@ -140,13 +178,17 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: shll $3, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_31:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_32:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %ecx
; X86-NEXT: jmp .LBB0_12
@@ -154,21 +196,27 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_34:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %ecx
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_35:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %ecx
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_36:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: shll $5, %ecx
; X86-NEXT: subl %eax, %ecx
@@ -180,10 +228,13 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_38:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: shll $5, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-HSW-LABEL: mult:
@@ -857,8 +908,11 @@ define i32 @foo() local_unnamed_addr #0 {
; X86-NEXT: negl %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 12
; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %ebx
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-HSW-LABEL: foo:
@@ -1072,10 +1126,15 @@ define i32 @foo() local_unnamed_addr #0 {
; X64-HSW-NEXT: negl %ecx
; X64-HSW-NEXT: movl %ecx, %eax
; X64-HSW-NEXT: addq $8, %rsp
+; X64-HSW-NEXT: .cfi_def_cfa_offset 40
; X64-HSW-NEXT: popq %rbx
+; X64-HSW-NEXT: .cfi_def_cfa_offset 32
; X64-HSW-NEXT: popq %r14
+; X64-HSW-NEXT: .cfi_def_cfa_offset 24
; X64-HSW-NEXT: popq %r15
+; X64-HSW-NEXT: .cfi_def_cfa_offset 16
; X64-HSW-NEXT: popq %rbp
+; X64-HSW-NEXT: .cfi_def_cfa_offset 8
; X64-HSW-NEXT: retq
%1 = tail call i32 @mult(i32 1, i32 0)
%2 = icmp ne i32 %1, 1
diff --git a/test/CodeGen/X86/mul-i256.ll b/test/CodeGen/X86/mul-i256.ll
index 0a48ae761ec..1e05b95dda0 100644
--- a/test/CodeGen/X86/mul-i256.ll
+++ b/test/CodeGen/X86/mul-i256.ll
@@ -349,10 +349,15 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 {
; X32-NEXT: movl %eax, 24(%ecx)
; X32-NEXT: movl %edx, 28(%ecx)
; X32-NEXT: addl $88, %esp
+; X32-NEXT: .cfi_def_cfa_offset 20
; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: popl %edi
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: popl %ebp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test:
@@ -421,8 +426,11 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 {
; X64-NEXT: movq %rax, 16(%r9)
; X64-NEXT: movq %rdx, 24(%r9)
; X64-NEXT: popq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 24
; X64-NEXT: popq %r14
+; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: popq %r15
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
%av = load i256, i256* %a
diff --git a/test/CodeGen/X86/mul128.ll b/test/CodeGen/X86/mul128.ll
index 70a6173a19f..0c11f17d8d1 100644
--- a/test/CodeGen/X86/mul128.ll
+++ b/test/CodeGen/X86/mul128.ll
@@ -86,10 +86,15 @@ define i128 @foo(i128 %t, i128 %u) {
; X86-NEXT: movl %edx, 12(%ecx)
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: addl $8, %esp
+; X86-NEXT: .cfi_def_cfa_offset 20
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 12
; X86-NEXT: popl %ebx
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl $4
%k = mul i128 %t, %u
ret i128 %k
diff --git a/test/CodeGen/X86/no-plt.ll b/test/CodeGen/X86/no-plt.ll
new file mode 100644
index 00000000000..d6383c2d7d1
--- /dev/null
+++ b/test/CodeGen/X86/no-plt.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \
+; RUN: | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu \
+; RUN: | FileCheck -check-prefix=X64 %s
+
+define i32 @main() #0 {
+; X64: callq *_Z3foov@GOTPCREL(%rip)
+; X64: callq _Z3barv
+; X64: callq _Z3bazv
+
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval, align 4
+ %call1 = call i32 @_Z3foov()
+ %call2 = call i32 @_Z3barv()
+ %call3 = call i32 @_Z3bazv()
+ ret i32 0
+}
+
+; Function Attrs: nonlazybind
+declare i32 @_Z3foov() #1
+
+declare i32 @_Z3barv() #2
+
+; Function Attrs: nonlazybind
+declare hidden i32 @_Z3bazv() #3
+
+
+attributes #1 = { nonlazybind }
+attributes #3 = { nonlazybind }
diff --git a/test/CodeGen/X86/pop-stack-cleanup-msvc.ll b/test/CodeGen/X86/pop-stack-cleanup-msvc.ll
new file mode 100644
index 00000000000..6330d3de72f
--- /dev/null
+++ b/test/CodeGen/X86/pop-stack-cleanup-msvc.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s | FileCheck %s
+
+target triple = "i686--windows-msvc"
+
+declare { i8*, i32 } @param2_ret2(i32, i32)
+declare i32 @__CxxFrameHandler3(...)
+
+
+define void @test_reserved_regs() minsize optsize personality i32 (...)* @__CxxFrameHandler3 {
+; CHECK-LABEL: test_reserved_regs:
+; CHECK: calll _param2_ret2
+; CHECK-NEXT: popl %ecx
+; CHECK-NEXT: popl %edi
+start:
+ %s = alloca i64
+ store i64 4, i64* %s
+ %0 = invoke { i8*, i32 } @param2_ret2(i32 0, i32 1)
+ to label %out unwind label %cleanup
+
+out:
+ ret void
+
+cleanup:
+ %cp = cleanuppad within none []
+ cleanupret from %cp unwind to caller
+}
diff --git a/test/CodeGen/X86/pr21792.ll b/test/CodeGen/X86/pr21792.ll
index 74f6c5a361f..54eb1fc7272 100644
--- a/test/CodeGen/X86/pr21792.ll
+++ b/test/CodeGen/X86/pr21792.ll
@@ -28,6 +28,7 @@ define void @func(<4 x float> %vx) {
; CHECK-NEXT: leaq stuff+8(%r9), %r9
; CHECK-NEXT: callq toto
; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
entry:
%tmp2 = bitcast <4 x float> %vx to <2 x i64>
diff --git a/test/CodeGen/X86/pr29061.ll b/test/CodeGen/X86/pr29061.ll
index 0cbe75f9ad5..b62d082507d 100644
--- a/test/CodeGen/X86/pr29061.ll
+++ b/test/CodeGen/X86/pr29061.ll
@@ -15,6 +15,7 @@ define void @t1(i8 signext %c) {
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: popl %edi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
entry:
tail call void asm sideeffect "", "{di},~{dirflag},~{fpsr},~{flags}"(i8 %c)
@@ -32,6 +33,7 @@ define void @t2(i8 signext %c) {
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
entry:
tail call void asm sideeffect "", "{si},~{dirflag},~{fpsr},~{flags}"(i8 %c)
diff --git a/test/CodeGen/X86/pr29112.ll b/test/CodeGen/X86/pr29112.ll
index cc670eeb978..d791936bd53 100644
--- a/test/CodeGen/X86/pr29112.ll
+++ b/test/CodeGen/X86/pr29112.ll
@@ -65,6 +65,7 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
; CHECK-NEXT: vaddps {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: addq $88, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
diff --git a/test/CodeGen/X86/pr30430.ll b/test/CodeGen/X86/pr30430.ll
index 0254c0940b8..06007a3a4cf 100644
--- a/test/CodeGen/X86/pr30430.ll
+++ b/test/CodeGen/X86/pr30430.ll
@@ -108,6 +108,7 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
; CHECK-NEXT: vmovss %xmm14, (%rsp) # 4-byte Spill
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
; CHECK-NEXT: retq
entry:
%__A.addr.i = alloca float, align 4
diff --git a/test/CodeGen/X86/pr32241.ll b/test/CodeGen/X86/pr32241.ll
index f48fef5f7fb..02f3bb12291 100644
--- a/test/CodeGen/X86/pr32241.ll
+++ b/test/CodeGen/X86/pr32241.ll
@@ -50,7 +50,9 @@ define i32 @_Z3foov() {
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp)
; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: addl $16, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
entry:
%aa = alloca i16, align 2
diff --git a/test/CodeGen/X86/pr32256.ll b/test/CodeGen/X86/pr32256.ll
index f6e254aaad0..5b6126fbc76 100644
--- a/test/CodeGen/X86/pr32256.ll
+++ b/test/CodeGen/X86/pr32256.ll
@@ -27,6 +27,7 @@ define void @_Z1av() {
; CHECK-NEXT: andb $1, %al
; CHECK-NEXT: movb %al, {{[0-9]+}}(%esp)
; CHECK-NEXT: addl $2, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
entry:
%b = alloca i8, align 1
diff --git a/test/CodeGen/X86/pr32282.ll b/test/CodeGen/X86/pr32282.ll
index d6e6f6eb107..67a0332ac53 100644
--- a/test/CodeGen/X86/pr32282.ll
+++ b/test/CodeGen/X86/pr32282.ll
@@ -43,6 +43,7 @@ define void @foo() {
; X86-NEXT: orl %eax, %edx
; X86-NEXT: setne {{[0-9]+}}(%esp)
; X86-NEXT: popl %eax
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: foo:
diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll
index 11eb6968709..59be67f0579 100644
--- a/test/CodeGen/X86/pr32284.ll
+++ b/test/CodeGen/X86/pr32284.ll
@@ -71,6 +71,7 @@ define void @foo() {
; 686-O0-NEXT: movzbl %al, %ecx
; 686-O0-NEXT: movl %ecx, (%esp)
; 686-O0-NEXT: addl $8, %esp
+; 686-O0-NEXT: .cfi_def_cfa_offset 4
; 686-O0-NEXT: retl
;
; 686-LABEL: foo:
@@ -88,6 +89,7 @@ define void @foo() {
; 686-NEXT: setle %dl
; 686-NEXT: movl %edx, {{[0-9]+}}(%esp)
; 686-NEXT: addl $8, %esp
+; 686-NEXT: .cfi_def_cfa_offset 4
; 686-NEXT: retl
entry:
%a = alloca i8, align 1
@@ -232,10 +234,15 @@ define void @f1() {
; 686-O0-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; 686-O0-NEXT: movl %esi, (%esp) # 4-byte Spill
; 686-O0-NEXT: addl $36, %esp
+; 686-O0-NEXT: .cfi_def_cfa_offset 20
; 686-O0-NEXT: popl %esi
+; 686-O0-NEXT: .cfi_def_cfa_offset 16
; 686-O0-NEXT: popl %edi
+; 686-O0-NEXT: .cfi_def_cfa_offset 12
; 686-O0-NEXT: popl %ebx
+; 686-O0-NEXT: .cfi_def_cfa_offset 8
; 686-O0-NEXT: popl %ebp
+; 686-O0-NEXT: .cfi_def_cfa_offset 4
; 686-O0-NEXT: retl
;
; 686-LABEL: f1:
@@ -277,8 +284,11 @@ define void @f1() {
; 686-NEXT: movl %eax, _ZN8struct_210member_2_0E
; 686-NEXT: movl $0, _ZN8struct_210member_2_0E+4
; 686-NEXT: addl $1, %esp
+; 686-NEXT: .cfi_def_cfa_offset 12
; 686-NEXT: popl %esi
+; 686-NEXT: .cfi_def_cfa_offset 8
; 686-NEXT: popl %edi
+; 686-NEXT: .cfi_def_cfa_offset 4
; 686-NEXT: retl
entry:
%a = alloca i8, align 1
@@ -392,8 +402,11 @@ define void @f2() {
; 686-O0-NEXT: movw %cx, %di
; 686-O0-NEXT: movw %di, (%eax)
; 686-O0-NEXT: addl $2, %esp
+; 686-O0-NEXT: .cfi_def_cfa_offset 12
; 686-O0-NEXT: popl %esi
+; 686-O0-NEXT: .cfi_def_cfa_offset 8
; 686-O0-NEXT: popl %edi
+; 686-O0-NEXT: .cfi_def_cfa_offset 4
; 686-O0-NEXT: retl
;
; 686-LABEL: f2:
@@ -414,6 +427,7 @@ define void @f2() {
; 686-NEXT: sete %dl
; 686-NEXT: movw %dx, (%eax)
; 686-NEXT: addl $2, %esp
+; 686-NEXT: .cfi_def_cfa_offset 4
; 686-NEXT: retl
entry:
%a = alloca i16, align 2
@@ -532,6 +546,7 @@ define void @f3() #0 {
; 686-O0-NEXT: popl %esi
; 686-O0-NEXT: popl %edi
; 686-O0-NEXT: popl %ebp
+; 686-O0-NEXT: .cfi_def_cfa %esp, 4
; 686-O0-NEXT: retl
;
; 686-LABEL: f3:
@@ -558,6 +573,7 @@ define void @f3() #0 {
; 686-NEXT: movl %ecx, var_46
; 686-NEXT: movl %ebp, %esp
; 686-NEXT: popl %ebp
+; 686-NEXT: .cfi_def_cfa %esp, 4
; 686-NEXT: retl
entry:
%a = alloca i64, align 8
diff --git a/test/CodeGen/X86/pr32329.ll b/test/CodeGen/X86/pr32329.ll
index f6bdade24c6..9d1bb90e824 100644
--- a/test/CodeGen/X86/pr32329.ll
+++ b/test/CodeGen/X86/pr32329.ll
@@ -57,9 +57,13 @@ define void @foo() local_unnamed_addr {
; X86-NEXT: imull %eax, %ebx
; X86-NEXT: movb %bl, var_218
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 12
; X86-NEXT: popl %ebx
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: foo:
diff --git a/test/CodeGen/X86/pr32345.ll b/test/CodeGen/X86/pr32345.ll
index f6802887e9e..2bdeca20731 100644
--- a/test/CodeGen/X86/pr32345.ll
+++ b/test/CodeGen/X86/pr32345.ll
@@ -84,6 +84,7 @@ define void @foo() {
; 6860-NEXT: popl %edi
; 6860-NEXT: popl %ebx
; 6860-NEXT: popl %ebp
+; 6860-NEXT: .cfi_def_cfa %esp, 4
; 6860-NEXT: retl
;
; X64-LABEL: foo:
@@ -127,6 +128,7 @@ define void @foo() {
; 686-NEXT: movb %dl, (%eax)
; 686-NEXT: movl %ebp, %esp
; 686-NEXT: popl %ebp
+; 686-NEXT: .cfi_def_cfa %esp, 4
; 686-NEXT: retl
bb:
%tmp = alloca i64, align 8
diff --git a/test/CodeGen/X86/pr32451.ll b/test/CodeGen/X86/pr32451.ll
index 67c0cb39f8c..5b7d1373d34 100644
--- a/test/CodeGen/X86/pr32451.ll
+++ b/test/CodeGen/X86/pr32451.ll
@@ -30,7 +30,9 @@ define i8** @japi1_convert_690(i8**, i8***, i32) {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; CHECK-NEXT: movl %eax, (%ecx)
; CHECK-NEXT: addl $16, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
top:
%3 = alloca i8***
diff --git a/test/CodeGen/X86/pr34088.ll b/test/CodeGen/X86/pr34088.ll
index 2049c5507c6..4d85722057f 100644
--- a/test/CodeGen/X86/pr34088.ll
+++ b/test/CodeGen/X86/pr34088.ll
@@ -27,6 +27,7 @@ define i32 @pr34088() local_unnamed_addr {
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %ebp, %esp
; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: .cfi_def_cfa %esp, 4
; CHECK-NEXT: retl
entry:
%foo = alloca %struct.Foo, align 4
diff --git a/test/CodeGen/X86/pr34653.ll b/test/CodeGen/X86/pr34653.ll
new file mode 100644
index 00000000000..129dbcacc95
--- /dev/null
+++ b/test/CodeGen/X86/pr34653.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+avx512f -o - | FileCheck %s
+
+declare fastcc <38 x double> @test()
+
+define void @pr34653() {
+; CHECK-LABEL: pr34653:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-512, %rsp # imm = 0xFE00
+; CHECK-NEXT: subq $2048, %rsp # imm = 0x800
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: callq test
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT: vmovaps %xmm0, %xmm1
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm2
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vmovaps %xmm3, %xmm4
+; CHECK-NEXT: vmovaps %xmm2, %xmm5
+; CHECK-NEXT: vmovaps %xmm5, %xmm6
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm7
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm8
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm9
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm10
+; CHECK-NEXT: vextractf32x4 $3, %zmm10, %xmm11
+; CHECK-NEXT: vmovaps %xmm11, %xmm12
+; CHECK-NEXT: vextractf32x4 $2, %zmm10, %xmm13
+; CHECK-NEXT: vmovaps %xmm13, %xmm14
+; CHECK-NEXT: vmovaps %xmm10, %xmm15
+; CHECK-NEXT: vmovaps %xmm15, %xmm2
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $3, %zmm9, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $2, %zmm9, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm9, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $3, %zmm8, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $2, %zmm8, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm8, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $3, %zmm7, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $2, %zmm7, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm7, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm11 = xmm11[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm13[1,0]
+; CHECK-NEXT: # kill: %YMM10<def> %YMM10<kill> %ZMM10<kill>
+; CHECK-NEXT: vextractf128 $1, %ymm10, %xmm10
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm10, %xmm0
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm15 = xmm15[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: # kill: %YMM9<def> %YMM9<kill> %ZMM9<kill>
+; CHECK-NEXT: vextractf128 $1, %ymm9, %xmm9
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm9, %xmm0
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: # kill: %YMM8<def> %YMM8<kill> %ZMM8<kill>
+; CHECK-NEXT: vextractf128 $1, %ymm8, %xmm8
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm8, %xmm0
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: # kill: %YMM7<def> %YMM7<kill> %ZMM7<kill>
+; CHECK-NEXT: vextractf128 $1, %ymm7, %xmm7
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm7, %xmm0
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm8, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm13, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm1, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm14, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm2, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm4, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm9, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm10, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm15, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm11, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm3, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm6, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm5, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm12, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm7, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %v = call fastcc <38 x double> @test()
+ %v.0 = extractelement <38 x double> %v, i32 0
+ ret void
+}
+
diff --git a/test/CodeGen/X86/pr34657.ll b/test/CodeGen/X86/pr34657.ll
new file mode 100644
index 00000000000..a63bc2a08dd
--- /dev/null
+++ b/test/CodeGen/X86/pr34657.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw -o - | FileCheck %s
+
+define <112 x i8> @pr34657() local_unnamed_addr {
+; CHECK-LABEL: pr34657
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vmovups (%rax), %xmm0
+; CHECK-NEXT: vmovups (%rax), %ymm1
+; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; CHECK-NEXT: vmovups (%rax), %zmm2
+; CHECK-NEXT: vmovaps %ymm1, 64(%rdi)
+; CHECK-NEXT: vmovaps %zmm2, (%rdi)
+; CHECK-NEXT: vextractf32x4 $2, %zmm0, 96(%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %wide.vec51 = load <112 x i8>, <112 x i8>* undef, align 2
+ ret <112 x i8> %wide.vec51
+}
diff --git a/test/CodeGen/X86/pr9743.ll b/test/CodeGen/X86/pr9743.ll
index 73b3c7f835c..ac3d4575510 100644
--- a/test/CodeGen/X86/pr9743.ll
+++ b/test/CodeGen/X86/pr9743.ll
@@ -11,4 +11,5 @@ define void @f() {
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
; CHECK-NEXT: ret
diff --git a/test/CodeGen/X86/push-cfi-debug.ll b/test/CodeGen/X86/push-cfi-debug.ll
index 7f438e306e4..01fa12e87d0 100644
--- a/test/CodeGen/X86/push-cfi-debug.ll
+++ b/test/CodeGen/X86/push-cfi-debug.ll
@@ -23,8 +23,10 @@ declare x86_stdcallcc void @stdfoo(i32, i32) #0
; CHECK: .cfi_adjust_cfa_offset 4
; CHECK: calll stdfoo
; CHECK: .cfi_adjust_cfa_offset -8
-; CHECK: addl $20, %esp
+; CHECK: addl $8, %esp
; CHECK: .cfi_adjust_cfa_offset -8
+; CHECK: addl $12, %esp
+; CHECK: .cfi_def_cfa_offset 4
define void @test1() #0 !dbg !4 {
entry:
tail call void @foo(i32 1, i32 2) #1, !dbg !10
diff --git a/test/CodeGen/X86/push-cfi-obj.ll b/test/CodeGen/X86/push-cfi-obj.ll
index 33291ec3318..2c9ec334027 100644
--- a/test/CodeGen/X86/push-cfi-obj.ll
+++ b/test/CodeGen/X86/push-cfi-obj.ll
@@ -12,7 +12,7 @@
; LINUX-NEXT: ]
; LINUX-NEXT: Address: 0x0
; LINUX-NEXT: Offset: 0x68
-; LINUX-NEXT: Size: 64
+; LINUX-NEXT: Size: 72
; LINUX-NEXT: Link: 0
; LINUX-NEXT: Info: 0
; LINUX-NEXT: AddressAlignment: 4
@@ -22,8 +22,9 @@
; LINUX-NEXT: SectionData (
; LINUX-NEXT: 0000: 1C000000 00000000 017A504C 5200017C |.........zPLR..||
; LINUX-NEXT: 0010: 08070000 00000000 1B0C0404 88010000 |................|
-; LINUX-NEXT: 0020: 1C000000 24000000 00000000 1D000000 |....$...........|
+; LINUX-NEXT: 0020: 24000000 24000000 00000000 1D000000 |$...$...........|
; LINUX-NEXT: 0030: 04000000 00410E08 8502420D 05432E10 |.....A....B..C..|
+; LINUX-NEXT: 0040: 540C0404 410C0508 |T...A...|
; LINUX-NEXT: )
declare i32 @__gxx_personality_v0(...)
@@ -35,7 +36,7 @@ entry:
to label %continue unwind label %cleanup
continue:
ret void
-cleanup:
+cleanup:
landingpad { i8*, i32 }
cleanup
ret void
diff --git a/test/CodeGen/X86/push-cfi.ll b/test/CodeGen/X86/push-cfi.ll
index 91e579a8391..44f8bf857c4 100644
--- a/test/CodeGen/X86/push-cfi.ll
+++ b/test/CodeGen/X86/push-cfi.ll
@@ -74,8 +74,9 @@ cleanup:
; LINUX-NEXT: pushl $1
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: call
-; LINUX-NEXT: addl $28, %esp
+; LINUX-NEXT: addl $16, %esp
; LINUX: .cfi_adjust_cfa_offset -16
+; LINUX: addl $12, %esp
; DARWIN-NOT: .cfi_escape
; DARWIN-NOT: pushl
define void @test2_nofp() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
index 0e9d149373b..296d165b3eb 100644
--- a/test/CodeGen/X86/recip-fastmath.ll
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -144,14 +144,14 @@ define float @f32_one_step(float %x) #1 {
;
; KNL-LABEL: f32_one_step:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: f32_one_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -257,7 +257,7 @@ define float @f32_two_step(float %x) #2 {
;
; KNL-LABEL: f32_two_step:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
@@ -268,7 +268,7 @@ define float @f32_two_step(float %x) #2 {
;
; SKX-LABEL: f32_two_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
@@ -416,7 +416,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
;
; SKX-LABEL: v4f32_one_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -533,7 +533,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
;
; SKX-LABEL: v4f32_two_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
@@ -691,7 +691,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
;
; SKX-LABEL: v8f32_one_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -821,7 +821,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
;
; SKX-LABEL: v8f32_two_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
diff --git a/test/CodeGen/X86/recip-fastmath2.ll b/test/CodeGen/X86/recip-fastmath2.ll
index a263e9d3b65..f6eeeec57f1 100644
--- a/test/CodeGen/X86/recip-fastmath2.ll
+++ b/test/CodeGen/X86/recip-fastmath2.ll
@@ -56,13 +56,13 @@ define float @f32_no_step_2(float %x) #3 {
;
; KNL-LABEL: f32_no_step_2:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: f32_no_step_2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast float 1234.0, %x
@@ -144,7 +144,7 @@ define float @f32_one_step_2(float %x) #1 {
;
; KNL-LABEL: f32_one_step_2:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
@@ -152,7 +152,7 @@ define float @f32_one_step_2(float %x) #1 {
;
; SKX-LABEL: f32_one_step_2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
@@ -243,7 +243,7 @@ define float @f32_one_step_2_divs(float %x) #1 {
;
; KNL-LABEL: f32_one_step_2_divs:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
@@ -252,7 +252,7 @@ define float @f32_one_step_2_divs(float %x) #1 {
;
; SKX-LABEL: f32_one_step_2_divs:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
@@ -368,7 +368,7 @@ define float @f32_two_step_2(float %x) #2 {
;
; KNL-LABEL: f32_two_step_2:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
@@ -380,7 +380,7 @@ define float @f32_two_step_2(float %x) #2 {
;
; SKX-LABEL: f32_two_step_2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
@@ -478,7 +478,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
;
; SKX-LABEL: v4f32_one_step2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
@@ -580,7 +580,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
;
; SKX-LABEL: v4f32_one_step_2_divs:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
@@ -708,7 +708,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
;
; SKX-LABEL: v4f32_two_step2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
@@ -814,7 +814,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
;
; SKX-LABEL: v8f32_one_step2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
@@ -925,7 +925,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
;
; SKX-LABEL: v8f32_one_step_2_divs:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [11:0.50]
@@ -1067,7 +1067,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
;
; SKX-LABEL: v8f32_two_step2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
@@ -1124,7 +1124,7 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
;
; SKX-LABEL: v8f32_no_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
@@ -1183,7 +1183,7 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
;
; SKX-LABEL: v8f32_no_step2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
diff --git a/test/CodeGen/X86/return-ext.ll b/test/CodeGen/X86/return-ext.ll
index ef160f43b4a..c66e518943a 100644
--- a/test/CodeGen/X86/return-ext.ll
+++ b/test/CodeGen/X86/return-ext.ll
@@ -106,6 +106,7 @@ entry:
; CHECK: call
; CHECK-NEXT: movzbl
; CHECK-NEXT: {{pop|add}}
+; CHECK-NEXT: .cfi_def_cfa_offset {{4|8}}
; CHECK-NEXT: ret
}
@@ -120,6 +121,7 @@ entry:
; CHECK: call
; CHECK-NEXT: movzbl
; CHECK-NEXT: {{pop|add}}
+; CHECK-NEXT: .cfi_def_cfa_offset {{4|8}}
; CHECK-NEXT: ret
}
@@ -134,5 +136,6 @@ entry:
; CHECK: call
; CHECK-NEXT: movzwl
; CHECK-NEXT: {{pop|add}}
+; CHECK-NEXT: .cfi_def_cfa_offset {{4|8}}
; CHECK-NEXT: ret
}
diff --git a/test/CodeGen/X86/rtm.ll b/test/CodeGen/X86/rtm.ll
index bd2d3e544bd..a1feeb5999b 100644
--- a/test/CodeGen/X86/rtm.ll
+++ b/test/CodeGen/X86/rtm.ll
@@ -75,6 +75,7 @@ define void @f2(i32 %x) nounwind uwtable {
; X64-NEXT: xabort $1
; X64-NEXT: callq f1
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
%x.addr = alloca i32, align 4
diff --git a/test/CodeGen/X86/schedule-x86_32.ll b/test/CodeGen/X86/schedule-x86_32.ll
new file mode 100644
index 00000000000..5dc06e61cc6
--- /dev/null
+++ b/test/CodeGen/X86/schedule-x86_32.ll
@@ -0,0 +1,348 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=i686 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define i8 @test_aaa(i8 %a0) optsize {
+; GENERIC-LABEL: test_aaa:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: aaa
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_aaa:
+; ATOM: # BB#0:
+; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: aaa
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_aaa:
+; SLM: # BB#0:
+; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: aaa
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aaa:
+; SANDY: # BB#0:
+; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: aaa
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_aaa:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: aaa
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [5:0.50]
+;
+; BROADWELL-LABEL: test_aaa:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: aaa
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_aaa:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: aaa
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_aaa:
+; SKX: # BB#0:
+; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: aaa
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_aaa:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: aaa
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aaa:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: aaa
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i8 asm "aaa", "=r,r"(i8 %a0) nounwind
+ ret i8 %1
+}
+
+define i8 @test_aad(i16 %a0) optsize {
+; GENERIC-LABEL: test_aad:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: aad
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_aad:
+; ATOM: # BB#0:
+; ATOM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: aad
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_aad:
+; SLM: # BB#0:
+; SLM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: aad
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aad:
+; SANDY: # BB#0:
+; SANDY-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: aad
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_aad:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: aad
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [5:0.50]
+;
+; BROADWELL-LABEL: test_aad:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: aad
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_aad:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: aad
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_aad:
+; SKX: # BB#0:
+; SKX-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: aad
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_aad:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: aad
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aad:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: aad
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i8 asm "aad", "=r,r"(i16 %a0) nounwind
+ ret i8 %1
+}
+
+define i16 @test_aam(i8 %a0) optsize {
+; GENERIC-LABEL: test_aam:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: aam
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_aam:
+; ATOM: # BB#0:
+; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: aam
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_aam:
+; SLM: # BB#0:
+; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: aam
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aam:
+; SANDY: # BB#0:
+; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: aam
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_aam:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: aam
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [5:0.50]
+;
+; BROADWELL-LABEL: test_aam:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: aam
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_aam:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: aam
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_aam:
+; SKX: # BB#0:
+; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: aam
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_aam:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: aam
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aam:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: aam
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i16 asm "aam", "=r,r"(i8 %a0) nounwind
+ ret i16 %1
+}
+
+define i8 @test_aas(i8 %a0) optsize {
+; GENERIC-LABEL: test_aas:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: aas
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_aas:
+; ATOM: # BB#0:
+; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: aas
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_aas:
+; SLM: # BB#0:
+; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: aas
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aas:
+; SANDY: # BB#0:
+; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: aas
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_aas:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: aas
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [5:0.50]
+;
+; BROADWELL-LABEL: test_aas:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: aas
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_aas:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: aas
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_aas:
+; SKX: # BB#0:
+; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: aas
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_aas:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: aas
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aas:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: aas
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i8 asm "aas", "=r,r"(i8 %a0) nounwind
+ ret i8 %1
+}
diff --git a/test/CodeGen/X86/schedule-x86_64.ll b/test/CodeGen/X86/schedule-x86_64.ll
new file mode 100644
index 00000000000..1db8c8768bd
--- /dev/null
+++ b/test/CodeGen/X86/schedule-x86_64.ll
@@ -0,0 +1,737 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define i16 @test_bsf16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_bsf16:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsfw %di, %ax
+; GENERIC-NEXT: bsfw (%rsi), %cx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsf16:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsfw %di, %ax
+; ATOM-NEXT: bsfw (%rsi), %cx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsf16:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsfw %di, %ax
+; SLM-NEXT: bsfw (%rsi), %cx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsf16:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsfw %di, %ax
+; SANDY-NEXT: bsfw (%rsi), %cx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsf16:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsfw %di, %ax
+; HASWELL-NEXT: bsfw (%rsi), %cx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsf16:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsfw %di, %ax
+; BROADWELL-NEXT: bsfw (%rsi), %cx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsf16:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsfw %di, %ax
+; SKYLAKE-NEXT: bsfw (%rsi), %cx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsf16:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsfw %di, %ax
+; SKX-NEXT: bsfw (%rsi), %cx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsf16:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsfw %di, %ax
+; BTVER2-NEXT: bsfw (%rsi), %cx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsf16:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsfw %di, %ax
+; ZNVER1-NEXT: bsfw (%rsi), %cx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i16, i16 } asm sideeffect "bsf $2, $0 \0A\09 bsf $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i16 %a0, i16* %a1)
+ %2 = extractvalue { i16, i16 } %1, 0
+ %3 = extractvalue { i16, i16 } %1, 1
+ %4 = or i16 %2, %3
+ ret i16 %4
+}
+define i32 @test_bsf32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_bsf32:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsfl %edi, %eax
+; GENERIC-NEXT: bsfl (%rsi), %ecx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsf32:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsfl %edi, %eax
+; ATOM-NEXT: bsfl (%rsi), %ecx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsf32:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsfl %edi, %eax
+; SLM-NEXT: bsfl (%rsi), %ecx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsf32:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsfl %edi, %eax
+; SANDY-NEXT: bsfl (%rsi), %ecx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsf32:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsfl %edi, %eax
+; HASWELL-NEXT: bsfl (%rsi), %ecx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsf32:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsfl %edi, %eax
+; BROADWELL-NEXT: bsfl (%rsi), %ecx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsf32:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsfl %edi, %eax
+; SKYLAKE-NEXT: bsfl (%rsi), %ecx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsf32:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsfl %edi, %eax
+; SKX-NEXT: bsfl (%rsi), %ecx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsf32:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsfl %edi, %eax
+; BTVER2-NEXT: bsfl (%rsi), %ecx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsf32:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsfl %edi, %eax
+; ZNVER1-NEXT: bsfl (%rsi), %ecx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i32, i32 } asm sideeffect "bsf $2, $0 \0A\09 bsf $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i32 %a0, i32* %a1)
+ %2 = extractvalue { i32, i32 } %1, 0
+ %3 = extractvalue { i32, i32 } %1, 1
+ %4 = or i32 %2, %3
+ ret i32 %4
+}
+define i64 @test_bsf64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_bsf64:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsfq %rdi, %rax
+; GENERIC-NEXT: bsfq (%rsi), %rcx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsf64:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsfq %rdi, %rax
+; ATOM-NEXT: bsfq (%rsi), %rcx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsf64:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsfq %rdi, %rax
+; SLM-NEXT: bsfq (%rsi), %rcx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsf64:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsfq %rdi, %rax
+; SANDY-NEXT: bsfq (%rsi), %rcx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsf64:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsfq %rdi, %rax
+; HASWELL-NEXT: bsfq (%rsi), %rcx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsf64:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsfq %rdi, %rax
+; BROADWELL-NEXT: bsfq (%rsi), %rcx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsf64:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsfq %rdi, %rax
+; SKYLAKE-NEXT: bsfq (%rsi), %rcx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsf64:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsfq %rdi, %rax
+; SKX-NEXT: bsfq (%rsi), %rcx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsf64:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsfq %rdi, %rax
+; BTVER2-NEXT: bsfq (%rsi), %rcx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsf64:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsfq %rdi, %rax
+; ZNVER1-NEXT: bsfq (%rsi), %rcx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i64, i64 } asm sideeffect "bsf $2, $0 \0A\09 bsf $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i64 %a0, i64* %a1)
+ %2 = extractvalue { i64, i64 } %1, 0
+ %3 = extractvalue { i64, i64 } %1, 1
+ %4 = or i64 %2, %3
+ ret i64 %4
+}
+
+define i16 @test_bsr16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_bsr16:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsrw %di, %ax
+; GENERIC-NEXT: bsrw (%rsi), %cx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsr16:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsrw %di, %ax
+; ATOM-NEXT: bsrw (%rsi), %cx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsr16:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsrw %di, %ax
+; SLM-NEXT: bsrw (%rsi), %cx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsr16:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsrw %di, %ax
+; SANDY-NEXT: bsrw (%rsi), %cx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsr16:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsrw %di, %ax
+; HASWELL-NEXT: bsrw (%rsi), %cx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsr16:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsrw %di, %ax
+; BROADWELL-NEXT: bsrw (%rsi), %cx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsr16:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsrw %di, %ax
+; SKYLAKE-NEXT: bsrw (%rsi), %cx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsr16:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsrw %di, %ax
+; SKX-NEXT: bsrw (%rsi), %cx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsr16:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsrw %di, %ax
+; BTVER2-NEXT: bsrw (%rsi), %cx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsr16:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsrw %di, %ax
+; ZNVER1-NEXT: bsrw (%rsi), %cx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i16, i16 } asm sideeffect "bsr $2, $0 \0A\09 bsr $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i16 %a0, i16* %a1)
+ %2 = extractvalue { i16, i16 } %1, 0
+ %3 = extractvalue { i16, i16 } %1, 1
+ %4 = or i16 %2, %3
+ ret i16 %4
+}
+define i32 @test_bsr32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_bsr32:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsrl %edi, %eax
+; GENERIC-NEXT: bsrl (%rsi), %ecx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsr32:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsrl %edi, %eax
+; ATOM-NEXT: bsrl (%rsi), %ecx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsr32:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsrl %edi, %eax
+; SLM-NEXT: bsrl (%rsi), %ecx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsr32:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsrl %edi, %eax
+; SANDY-NEXT: bsrl (%rsi), %ecx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsr32:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsrl %edi, %eax
+; HASWELL-NEXT: bsrl (%rsi), %ecx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsr32:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsrl %edi, %eax
+; BROADWELL-NEXT: bsrl (%rsi), %ecx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsr32:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsrl %edi, %eax
+; SKYLAKE-NEXT: bsrl (%rsi), %ecx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsr32:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsrl %edi, %eax
+; SKX-NEXT: bsrl (%rsi), %ecx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsr32:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsrl %edi, %eax
+; BTVER2-NEXT: bsrl (%rsi), %ecx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsr32:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsrl %edi, %eax
+; ZNVER1-NEXT: bsrl (%rsi), %ecx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i32, i32 } asm sideeffect "bsr $2, $0 \0A\09 bsr $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i32 %a0, i32* %a1)
+ %2 = extractvalue { i32, i32 } %1, 0
+ %3 = extractvalue { i32, i32 } %1, 1
+ %4 = or i32 %2, %3
+ ret i32 %4
+}
+define i64 @test_bsr64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_bsr64:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsrq %rdi, %rax
+; GENERIC-NEXT: bsrq (%rsi), %rcx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsr64:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsrq %rdi, %rax
+; ATOM-NEXT: bsrq (%rsi), %rcx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsr64:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsrq %rdi, %rax
+; SLM-NEXT: bsrq (%rsi), %rcx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsr64:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsrq %rdi, %rax
+; SANDY-NEXT: bsrq (%rsi), %rcx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsr64:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsrq %rdi, %rax
+; HASWELL-NEXT: bsrq (%rsi), %rcx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsr64:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsrq %rdi, %rax
+; BROADWELL-NEXT: bsrq (%rsi), %rcx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsr64:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsrq %rdi, %rax
+; SKYLAKE-NEXT: bsrq (%rsi), %rcx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsr64:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsrq %rdi, %rax
+; SKX-NEXT: bsrq (%rsi), %rcx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsr64:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsrq %rdi, %rax
+; BTVER2-NEXT: bsrq (%rsi), %rcx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsr64:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsrq %rdi, %rax
+; ZNVER1-NEXT: bsrq (%rsi), %rcx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i64, i64 } asm sideeffect "bsr $2, $0 \0A\09 bsr $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i64 %a0, i64* %a1)
+ %2 = extractvalue { i64, i64 } %1, 0
+ %3 = extractvalue { i64, i64 } %1, 1
+ %4 = or i64 %2, %3
+ ret i64 %4
+}
+
+define i32 @test_bswap32(i32 %a0) optsize {
+; GENERIC-LABEL: test_bswap32:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: bswapl %edi # sched: [2:1.00]
+; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bswap32:
+; ATOM: # BB#0:
+; ATOM-NEXT: bswapl %edi # sched: [1:1.00]
+; ATOM-NEXT: movl %edi, %eax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bswap32:
+; SLM: # BB#0:
+; SLM-NEXT: bswapl %edi # sched: [1:0.50]
+; SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bswap32:
+; SANDY: # BB#0:
+; SANDY-NEXT: bswapl %edi # sched: [2:1.00]
+; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bswap32:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: bswapl %edi # sched: [2:0.50]
+; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bswap32:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: bswapl %edi # sched: [2:0.50]
+; BROADWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bswap32:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: bswapl %edi # sched: [2:0.50]
+; SKYLAKE-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bswap32:
+; SKX: # BB#0:
+; SKX-NEXT: bswapl %edi # sched: [2:0.50]
+; SKX-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bswap32:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: bswapl %edi # sched: [1:0.50]
+; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bswap32:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: bswapl %edi # sched: [1:1.00]
+; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = tail call i32 asm "bswap $0", "=r,0"(i32 %a0) nounwind
+ ret i32 %1
+}
+define i64 @test_bswap64(i64 %a0) optsize {
+; GENERIC-LABEL: test_bswap64:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: bswapq %rdi # sched: [2:1.00]
+; GENERIC-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bswap64:
+; ATOM: # BB#0:
+; ATOM-NEXT: bswapq %rdi # sched: [1:1.00]
+; ATOM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bswap64:
+; SLM: # BB#0:
+; SLM-NEXT: bswapq %rdi # sched: [1:0.50]
+; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bswap64:
+; SANDY: # BB#0:
+; SANDY-NEXT: bswapq %rdi # sched: [2:1.00]
+; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bswap64:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: bswapq %rdi # sched: [2:0.50]
+; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bswap64:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: bswapq %rdi # sched: [2:0.50]
+; BROADWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bswap64:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: bswapq %rdi # sched: [2:0.50]
+; SKYLAKE-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bswap64:
+; SKX: # BB#0:
+; SKX-NEXT: bswapq %rdi # sched: [2:0.50]
+; SKX-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bswap64:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: bswapq %rdi # sched: [1:0.50]
+; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bswap64:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: bswapq %rdi # sched: [1:1.00]
+; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = tail call i64 asm "bswap $0", "=r,0"(i64 %a0) nounwind
+ ret i64 %1
+}
diff --git a/test/CodeGen/X86/select-mmx.ll b/test/CodeGen/X86/select-mmx.ll
index 795990e3c32..7ad8b6f1b9c 100644
--- a/test/CodeGen/X86/select-mmx.ll
+++ b/test/CodeGen/X86/select-mmx.ll
@@ -48,6 +48,7 @@ define i64 @test47(i64 %arg) {
; I32-NEXT: movl {{[0-9]+}}(%esp), %edx
; I32-NEXT: movl %ebp, %esp
; I32-NEXT: popl %ebp
+; I32-NEXT: .cfi_def_cfa %esp, 4
; I32-NEXT: retl
%cond = icmp eq i64 %arg, 0
%slct = select i1 %cond, x86_mmx bitcast (i64 7 to x86_mmx), x86_mmx bitcast (i64 0 to x86_mmx)
@@ -100,6 +101,7 @@ define i64 @test49(i64 %arg, i64 %x, i64 %y) {
; I32-NEXT: movl {{[0-9]+}}(%esp), %edx
; I32-NEXT: movl %ebp, %esp
; I32-NEXT: popl %ebp
+; I32-NEXT: .cfi_def_cfa %esp, 4
; I32-NEXT: retl
%cond = icmp eq i64 %arg, 0
%xmmx = bitcast i64 %x to x86_mmx
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index 52225397ef0..c3674639eab 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -15,7 +15,6 @@ define i32 @test1(%0* %p, %0* %q, i1 %r) nounwind {
; CHECK-NEXT: cmovneq %rdi, %rsi
; CHECK-NEXT: movl (%rsi), %eax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test1:
; MCU: # BB#0:
@@ -45,7 +44,7 @@ define i32 @test2() nounwind {
; GENERIC-NEXT: callq _return_false
; GENERIC-NEXT: xorl %ecx, %ecx
; GENERIC-NEXT: testb $1, %al
-; GENERIC-NEXT: movl $-480, %eax
+; GENERIC-NEXT: movl $-480, %eax ## imm = 0xFE20
; GENERIC-NEXT: cmovnel %ecx, %eax
; GENERIC-NEXT: shll $3, %eax
; GENERIC-NEXT: cmpl $32768, %eax ## imm = 0x8000
@@ -55,14 +54,13 @@ define i32 @test2() nounwind {
; GENERIC-NEXT: popq %rcx
; GENERIC-NEXT: retq
; GENERIC-NEXT: LBB1_1: ## %bb90
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test2:
; ATOM: ## BB#0: ## %entry
; ATOM-NEXT: pushq %rax
; ATOM-NEXT: callq _return_false
; ATOM-NEXT: xorl %ecx, %ecx
-; ATOM-NEXT: movl $-480, %edx
+; ATOM-NEXT: movl $-480, %edx ## imm = 0xFE20
; ATOM-NEXT: testb $1, %al
; ATOM-NEXT: cmovnel %ecx, %edx
; ATOM-NEXT: shll $3, %edx
@@ -73,17 +71,16 @@ define i32 @test2() nounwind {
; ATOM-NEXT: popq %rcx
; ATOM-NEXT: retq
; ATOM-NEXT: LBB1_1: ## %bb90
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test2:
; MCU: # BB#0: # %entry
; MCU-NEXT: calll return_false
-; MCU-NEXT: xorl %ecx, %ecx
+; MCU-NEXT: xorl %ecx, %ecx
; MCU-NEXT: testb $1, %al
; MCU-NEXT: jne .LBB1_2
; MCU-NEXT: # BB#1: # %entry
; MCU-NEXT: movl $-480, %ecx # imm = 0xFE20
-; MCU-NEXT: .LBB1_2:
+; MCU-NEXT: .LBB1_2: # %entry
; MCU-NEXT: shll $3, %ecx
; MCU-NEXT: cmpl $32768, %ecx # imm = 0x8000
; MCU-NEXT: jge .LBB1_3
@@ -116,7 +113,6 @@ define float @test3(i32 %x) nounwind readnone {
; CHECK-NEXT: leaq {{.*}}(%rip), %rcx
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test3:
; MCU: # BB#0: # %entry
@@ -140,7 +136,6 @@ define signext i8 @test4(i8* nocapture %P, double %F) nounwind readonly {
; CHECK-NEXT: seta %al
; CHECK-NEXT: movsbl (%rdi,%rax,4), %eax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test4:
; MCU: # BB#0: # %entry
@@ -175,7 +170,6 @@ define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind {
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; CHECK-NEXT: movd %xmm0, (%rsi)
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test5:
; MCU: # BB#0:
@@ -211,7 +205,6 @@ define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-NEXT: mulps %xmm0, %xmm0
; CHECK-NEXT: movaps %xmm0, (%rsi)
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test6:
; MCU: # BB#0:
@@ -283,7 +276,6 @@ define x86_fp80 @test7(i32 %tmp8) nounwind {
; CHECK-NEXT: leaq {{.*}}(%rip), %rcx
; CHECK-NEXT: fldt (%rax,%rcx)
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test7:
; MCU: # BB#0:
@@ -333,7 +325,6 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; GENERIC-NEXT: movq %xmm1, 16(%rsi)
; GENERIC-NEXT: movdqa %xmm0, (%rsi)
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test8:
; ATOM: ## BB#0:
@@ -366,7 +357,6 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; ATOM-NEXT: movdqa %xmm0, (%rsi)
; ATOM-NEXT: movq %xmm1, 16(%rsi)
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test8:
; MCU: # BB#0:
@@ -456,7 +446,6 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: orq %rsi, %rax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test9:
; ATOM: ## BB#0:
@@ -466,7 +455,6 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test9:
; MCU: # BB#0:
@@ -493,7 +481,6 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: orq %rsi, %rax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test9a:
; ATOM: ## BB#0:
@@ -503,7 +490,6 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test9a:
; MCU: # BB#0:
@@ -528,7 +514,6 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: orq %rsi, %rax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test9b:
; ATOM: ## BB#0:
@@ -538,7 +523,6 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test9b:
; MCU: # BB#0:
@@ -566,7 +550,6 @@ define i64 @test10(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK-NEXT: setne %al
; CHECK-NEXT: leaq -1(%rax,%rax), %rax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test10:
; MCU: # BB#0:
@@ -592,7 +575,6 @@ define i64 @test11(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK-NEXT: notq %rax
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test11:
; MCU: # BB#0:
@@ -619,7 +601,6 @@ define i64 @test11a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK-NEXT: notq %rax
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test11a:
; MCU: # BB#0:
@@ -649,7 +630,6 @@ define noalias i8* @test12(i64 %count) nounwind ssp noredzone {
; GENERIC-NEXT: movq $-1, %rdi
; GENERIC-NEXT: cmovnoq %rax, %rdi
; GENERIC-NEXT: jmp __Znam ## TAILCALL
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test12:
; ATOM: ## BB#0: ## %entry
@@ -659,7 +639,6 @@ define noalias i8* @test12(i64 %count) nounwind ssp noredzone {
; ATOM-NEXT: movq $-1, %rdi
; ATOM-NEXT: cmovnoq %rax, %rdi
; ATOM-NEXT: jmp __Znam ## TAILCALL
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test12:
; MCU: # BB#0: # %entry
@@ -710,7 +689,6 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
; GENERIC-NEXT: cmpl %esi, %edi
; GENERIC-NEXT: sbbl %eax, %eax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test13:
; ATOM: ## BB#0:
@@ -721,7 +699,6 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test13:
; MCU: # BB#0:
@@ -741,7 +718,6 @@ define i32 @test14(i32 %a, i32 %b) nounwind {
; CHECK-NEXT: setae %al
; CHECK-NEXT: negl %eax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test14:
; MCU: # BB#0:
@@ -763,7 +739,6 @@ define i32 @test15(i32 %x) nounwind {
; GENERIC-NEXT: negl %edi
; GENERIC-NEXT: sbbl %eax, %eax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test15:
; ATOM: ## BB#0: ## %entry
@@ -774,7 +749,6 @@ define i32 @test15(i32 %x) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test15:
; MCU: # BB#0: # %entry
@@ -826,7 +800,6 @@ define i16 @test17(i16 %x) nounwind {
; GENERIC-NEXT: sbbl %eax, %eax
; GENERIC-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test17:
; ATOM: ## BB#0: ## %entry
@@ -838,7 +811,6 @@ define i16 @test17(i16 %x) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test17:
; MCU: # BB#0: # %entry
@@ -859,7 +831,6 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
; GENERIC-NEXT: cmovgel %edx, %esi
; GENERIC-NEXT: movl %esi, %eax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test18:
; ATOM: ## BB#0:
@@ -869,7 +840,6 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test18:
; MCU: # BB#0:
diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll
index 20c77a4a517..5ae2cc5f35c 100644
--- a/test/CodeGen/X86/setcc-lowering.ll
+++ b/test/CodeGen/X86/setcc-lowering.ll
@@ -23,10 +23,9 @@ define <8 x i16> @pr25080(<8 x i32> %a) {
;
; KNL-32-LABEL: pr25080:
; KNL-32: # BB#0: # %entry
-; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607]
-; KNL-32-NEXT: vpand %ymm1, %ymm0, %ymm0
-; KNL-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-32-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; KNL-32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-32-NEXT: vbroadcastss {{.*#+}} ymm1 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607]
+; KNL-32-NEXT: vptestnmd %zmm1, %zmm0, %k0
; KNL-32-NEXT: movb $15, %al
; KNL-32-NEXT: kmovw %eax, %k1
; KNL-32-NEXT: korw %k1, %k0, %k1
@@ -90,6 +89,7 @@ define void @pr26232(i64 %a, <16 x i1> %b) {
; KNL-32-NEXT: jne .LBB1_1
; KNL-32-NEXT: # BB#2: # %for_exit600
; KNL-32-NEXT: popl %esi
+; KNL-32-NEXT: .cfi_def_cfa_offset 4
; KNL-32-NEXT: retl
allocas:
br label %for_test11.preheader
diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll
index 79cf0f2c8f1..a2767205fe2 100644
--- a/test/CodeGen/X86/shrink_vmul.ll
+++ b/test/CodeGen/X86/shrink_vmul.ll
@@ -31,6 +31,7 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi8:
@@ -89,6 +90,7 @@ define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_4xi8:
@@ -148,6 +150,7 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_8xi8:
@@ -220,6 +223,7 @@ define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: movdqu %xmm4, 16(%esi,%ecx,4)
; X86-NEXT: movdqu %xmm3, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_16xi8:
@@ -288,6 +292,7 @@ define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi16:
@@ -342,6 +347,7 @@ define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_4xi16:
@@ -399,6 +405,7 @@ define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_8xi16:
@@ -469,6 +476,7 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6
; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_16xi16:
@@ -541,6 +549,7 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b,
; X86-NEXT: psrad $16, %xmm0
; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi8_sext:
@@ -606,6 +615,7 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi8_sext_zext:
@@ -666,6 +676,7 @@ define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi16_sext:
@@ -733,6 +744,7 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi16_sext_zext:
@@ -813,6 +825,7 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %
; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_16xi16_sext:
diff --git a/test/CodeGen/X86/sse-intrinsics-x86.ll b/test/CodeGen/X86/sse-intrinsics-x86.ll
index f178e18a259..ca74ee5732d 100644
--- a/test/CodeGen/X86/sse-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse-intrinsics-x86.ll
@@ -401,15 +401,10 @@ define <4 x float> @test_x86_sse_rcp_ps(<4 x float> %a0) {
; SSE-NEXT: rcpps %xmm0, %xmm0 ## encoding: [0x0f,0x53,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; AVX2-LABEL: test_x86_sse_rcp_ps:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_rcp_ps:
-; SKX: ## BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4c,0xc0]
-; SKX-NEXT: retl ## encoding: [0xc3]
+; VCHECK-LABEL: test_x86_sse_rcp_ps:
+; VCHECK: ## BB#0:
+; VCHECK-NEXT: vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0]
+; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -438,15 +433,10 @@ define <4 x float> @test_x86_sse_rsqrt_ps(<4 x float> %a0) {
; SSE-NEXT: rsqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x52,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; AVX2-LABEL: test_x86_sse_rsqrt_ps:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_rsqrt_ps:
-; SKX: ## BB#0:
-; SKX-NEXT: vrsqrt14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4e,0xc0]
-; SKX-NEXT: retl ## encoding: [0xc3]
+; VCHECK-LABEL: test_x86_sse_rsqrt_ps:
+; VCHECK: ## BB#0:
+; VCHECK-NEXT: vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0]
+; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -475,10 +465,15 @@ define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
; SSE-NEXT: sqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse_sqrt_ps:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse_sqrt_ps:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse_sqrt_ps:
+; SKX: ## BB#0:
+; SKX-NEXT: vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -491,10 +486,15 @@ define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) {
; SSE-NEXT: sqrtss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse_sqrt_ss:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse_sqrt_ss:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse_sqrt_ss:
+; SKX: ## BB#0:
+; SKX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll
index b5c2bff4b8f..d3c995197e8 100644
--- a/test/CodeGen/X86/sse-schedule.ll
+++ b/test/CodeGen/X86/sse-schedule.ll
@@ -2547,8 +2547,8 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) {
;
; SKX-LABEL: test_rcpps:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm0 # sched: [4:1.00]
-; SKX-NEXT: vrcp14ps (%rdi), %xmm1 # sched: [10:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpps (%rdi), %xmm1 # sched: [10:1.00]
; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
@@ -2719,8 +2719,8 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) {
;
; SKX-LABEL: test_rsqrtps:
; SKX: # BB#0:
-; SKX-NEXT: vrsqrt14ps %xmm0, %xmm0 # sched: [4:1.00]
-; SKX-NEXT: vrsqrt14ps (%rdi), %xmm1 # sched: [10:1.00]
+; SKX-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [10:1.00]
; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index d4047faad9b..72c68c56638 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -1592,10 +1592,15 @@ define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
; SSE-NEXT: sqrtpd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse2_sqrt_pd:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse2_sqrt_pd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_sqrt_pd:
+; SKX: ## BB#0:
+; SKX-NEXT: vsqrtpd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x51,0xc0]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1608,10 +1613,15 @@ define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
; SSE-NEXT: sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse2_sqrt_sd:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse2_sqrt_sd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_sqrt_sd:
+; SKX: ## BB#0:
+; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1637,7 +1647,7 @@ define <2 x double> @test_x86_sse2_sqrt_sd_vec_load(<2 x double>* %a0) {
; SKX: ## BB#0:
; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SKX-NEXT: vmovapd (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x00]
-; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
+; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%a1 = load <2 x double>, <2 x double>* %a0, align 16
%res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a1) ; <<2 x double>> [#uses=1]
diff --git a/test/CodeGen/X86/statepoint-call-lowering.ll b/test/CodeGen/X86/statepoint-call-lowering.ll
index bd2dd53b654..d80c87b99b6 100644
--- a/test/CodeGen/X86/statepoint-call-lowering.ll
+++ b/test/CodeGen/X86/statepoint-call-lowering.ll
@@ -83,6 +83,7 @@ define i1 @test_relocate(i32 addrspace(1)* %a) gc "statepoint-example" {
; CHECK: callq return_i1
; CHECK-NEXT: .Ltmp5:
; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
entry:
%safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a)
diff --git a/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll b/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
index b88ca03805f..90f2002e2d4 100644
--- a/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
+++ b/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
@@ -69,6 +69,7 @@ define i1 @test_relocate(i32 addrspace(1)* %a) gc "statepoint-example" {
; CHECK: callq return_i1
; CHECK-NEXT: .Ltmp4:
; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
entry:
%safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 1, i32 0, i32 0, i32 addrspace(1)* %a)
diff --git a/test/CodeGen/X86/statepoint-invoke.ll b/test/CodeGen/X86/statepoint-invoke.ll
index 784b932addc..5aa902546c1 100644
--- a/test/CodeGen/X86/statepoint-invoke.ll
+++ b/test/CodeGen/X86/statepoint-invoke.ll
@@ -142,6 +142,7 @@ normal_return:
; CHECK-LABEL: %normal_return
; CHECK: xorl %eax, %eax
; CHECK-NEXT: popq
+ ; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%null.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 13, i32 13)
%undef.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 14, i32 14)
@@ -169,6 +170,7 @@ entry:
normal_return:
; CHECK: leaq
; CHECK-NEXT: popq
+ ; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%aa.rel = call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %sp, i32 13, i32 13)
%aa.converted = bitcast i32 addrspace(1)* %aa.rel to i64 addrspace(1)*
@@ -177,6 +179,7 @@ normal_return:
exceptional_return:
; CHECK: movl $15
; CHECK-NEXT: popq
+ ; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%landing_pad = landingpad token
cleanup
diff --git a/test/CodeGen/X86/throws-cfi-fp.ll b/test/CodeGen/X86/throws-cfi-fp.ll
new file mode 100644
index 00000000000..bacd965054c
--- /dev/null
+++ b/test/CodeGen/X86/throws-cfi-fp.ll
@@ -0,0 +1,98 @@
+; RUN: llc %s -o - | FileCheck %s
+
+; ModuleID = 'throws-cfi-fp.cpp'
+source_filename = "throws-cfi-fp.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__clang_call_terminate = comdat any
+
+@_ZL11ShouldThrow = internal unnamed_addr global i1 false, align 1
+@_ZTIi = external constant i8*
+@str = private unnamed_addr constant [20 x i8] c"Threw an exception!\00"
+
+; Function Attrs: uwtable
+define void @_Z6throwsv() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+
+; CHECK-LABEL: _Z6throwsv:
+; CHECK: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_1:
+; CHECK-NEXT: .cfi_def_cfa %rbp, 16
+
+entry:
+ %.b5 = load i1, i1* @_ZL11ShouldThrow, align 1
+ br i1 %.b5, label %if.then, label %try.cont
+
+if.then: ; preds = %entry
+ %exception = tail call i8* @__cxa_allocate_exception(i64 4)
+ %0 = bitcast i8* %exception to i32*
+ store i32 1, i32* %0, align 16
+ invoke void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+ to label %unreachable unwind label %lpad
+
+lpad: ; preds = %if.then
+ %1 = landingpad { i8*, i32 }
+ catch i8* null
+ %2 = extractvalue { i8*, i32 } %1, 0
+ %3 = tail call i8* @__cxa_begin_catch(i8* %2)
+ %puts = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @str, i64 0, i64 0))
+ invoke void @__cxa_rethrow()
+ to label %unreachable unwind label %lpad1
+
+lpad1: ; preds = %lpad
+ %4 = landingpad { i8*, i32 }
+ cleanup
+ invoke void @__cxa_end_catch()
+ to label %eh.resume unwind label %terminate.lpad
+
+try.cont: ; preds = %entry
+ ret void
+
+eh.resume: ; preds = %lpad1
+ resume { i8*, i32 } %4
+
+terminate.lpad: ; preds = %lpad1
+ %5 = landingpad { i8*, i32 }
+ catch i8* null
+ %6 = extractvalue { i8*, i32 } %5, 0
+ tail call void @__clang_call_terminate(i8* %6)
+ unreachable
+
+unreachable: ; preds = %lpad, %if.then
+ unreachable
+}
+
+declare i8* @__cxa_allocate_exception(i64)
+
+declare void @__cxa_throw(i8*, i8*, i8*)
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_rethrow()
+
+declare void @__cxa_end_catch()
+
+; Function Attrs: noinline noreturn nounwind
+declare void @__clang_call_terminate(i8*)
+
+declare void @_ZSt9terminatev()
+
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture readonly)
+
+attributes #0 = { "no-frame-pointer-elim"="true" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!8, !9, !10}
+
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 6.0.0 (https://github.com/llvm-mirror/clang.git 316ebefb7fff8ad324a08a694347500b6cd7c95f) (https://github.com/llvm-mirror/llvm.git dcae9be81fc17cdfbe989402354d3c8ecd0a2c79)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "throws-cfi-fp.cpp", directory: "epilogue-dwarf/test")
+!4 = !{}
+!5 = !{}
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = !{i32 1, !"wchar_size", i32 4}
diff --git a/test/CodeGen/X86/throws-cfi-no-fp.ll b/test/CodeGen/X86/throws-cfi-no-fp.ll
new file mode 100644
index 00000000000..1483e6b8483
--- /dev/null
+++ b/test/CodeGen/X86/throws-cfi-no-fp.ll
@@ -0,0 +1,97 @@
+; RUN: llc %s -o - | FileCheck %s
+
+; ModuleID = 'throws-cfi-no-fp.cpp'
+source_filename = "throws-cfi-no-fp.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__clang_call_terminate = comdat any
+
+@_ZL11ShouldThrow = internal unnamed_addr global i1 false, align 1
+@_ZTIi = external constant i8*
+@str = private unnamed_addr constant [20 x i8] c"Threw an exception!\00"
+
+; Function Attrs: uwtable
+define void @_Z6throwsv() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+
+; CHECK-LABEL: _Z6throwsv:
+; CHECK: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_1:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+
+entry:
+ %.b5 = load i1, i1* @_ZL11ShouldThrow, align 1
+ br i1 %.b5, label %if.then, label %try.cont
+
+if.then: ; preds = %entry
+ %exception = tail call i8* @__cxa_allocate_exception(i64 4)
+ %0 = bitcast i8* %exception to i32*
+ store i32 1, i32* %0, align 16
+ invoke void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+ to label %unreachable unwind label %lpad
+
+lpad: ; preds = %if.then
+ %1 = landingpad { i8*, i32 }
+ catch i8* null
+ %2 = extractvalue { i8*, i32 } %1, 0
+ %3 = tail call i8* @__cxa_begin_catch(i8* %2)
+ %puts = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @str, i64 0, i64 0))
+ invoke void @__cxa_rethrow() #4
+ to label %unreachable unwind label %lpad1
+
+lpad1: ; preds = %lpad
+ %4 = landingpad { i8*, i32 }
+ cleanup
+ invoke void @__cxa_end_catch()
+ to label %eh.resume unwind label %terminate.lpad
+
+try.cont: ; preds = %entry
+ ret void
+
+eh.resume: ; preds = %lpad1
+ resume { i8*, i32 } %4
+
+terminate.lpad: ; preds = %lpad1
+ %5 = landingpad { i8*, i32 }
+ catch i8* null
+ %6 = extractvalue { i8*, i32 } %5, 0
+ tail call void @__clang_call_terminate(i8* %6)
+ unreachable
+
+unreachable: ; preds = %lpad, %if.then
+ unreachable
+}
+
+declare i8* @__cxa_allocate_exception(i64)
+
+declare void @__cxa_throw(i8*, i8*, i8*)
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_rethrow()
+
+declare void @__cxa_end_catch()
+
+; Function Attrs: noinline noreturn nounwind
+declare void @__clang_call_terminate(i8*)
+
+declare void @_ZSt9terminatev()
+
+
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture readonly)
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!8, !9, !10}
+
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 6.0.0 (https://github.com/llvm-mirror/clang.git 316ebefb7fff8ad324a08a694347500b6cd7c95f) (https://github.com/llvm-mirror/llvm.git dcae9be81fc17cdfbe989402354d3c8ecd0a2c79)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "throws-cfi-no-fp.cpp", directory: "epilogue-dwarf/test")
+!4 = !{}
+!5 = !{}
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = !{i32 1, !"wchar_size", i32 4}
diff --git a/test/CodeGen/X86/var-permute-128.ll b/test/CodeGen/X86/var-permute-128.ll
index f74343d7f2a..208fab88b58 100644
--- a/test/CodeGen/X86/var-permute-128.ll
+++ b/test/CodeGen/X86/var-permute-128.ll
@@ -143,35 +143,40 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: retq
;
-; AVX-LABEL: var_shuffle_v8i16:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd %xmm1, %eax
-; AVX-NEXT: vpextrw $1, %xmm1, %r10d
-; AVX-NEXT: vpextrw $2, %xmm1, %ecx
-; AVX-NEXT: vpextrw $3, %xmm1, %edx
-; AVX-NEXT: vpextrw $4, %xmm1, %esi
-; AVX-NEXT: vpextrw $5, %xmm1, %edi
-; AVX-NEXT: vpextrw $6, %xmm1, %r8d
-; AVX-NEXT: vpextrw $7, %xmm1, %r9d
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: andl $7, %eax
-; AVX-NEXT: andl $7, %r10d
-; AVX-NEXT: andl $7, %ecx
-; AVX-NEXT: andl $7, %edx
-; AVX-NEXT: andl $7, %esi
-; AVX-NEXT: andl $7, %edi
-; AVX-NEXT: andl $7, %r8d
-; AVX-NEXT: andl $7, %r9d
-; AVX-NEXT: movzwl -24(%rsp,%rax,2), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpinsrw $1, -24(%rsp,%r10,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $2, -24(%rsp,%rcx,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $3, -24(%rsp,%rdx,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $4, -24(%rsp,%rsi,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $5, -24(%rsp,%rdi,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $6, -24(%rsp,%r8,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $7, -24(%rsp,%r9,2), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVXNOVLBW-LABEL: var_shuffle_v8i16:
+; AVXNOVLBW: # BB#0:
+; AVXNOVLBW-NEXT: vmovd %xmm1, %eax
+; AVXNOVLBW-NEXT: vpextrw $1, %xmm1, %r10d
+; AVXNOVLBW-NEXT: vpextrw $2, %xmm1, %ecx
+; AVXNOVLBW-NEXT: vpextrw $3, %xmm1, %edx
+; AVXNOVLBW-NEXT: vpextrw $4, %xmm1, %esi
+; AVXNOVLBW-NEXT: vpextrw $5, %xmm1, %edi
+; AVXNOVLBW-NEXT: vpextrw $6, %xmm1, %r8d
+; AVXNOVLBW-NEXT: vpextrw $7, %xmm1, %r9d
+; AVXNOVLBW-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVXNOVLBW-NEXT: andl $7, %eax
+; AVXNOVLBW-NEXT: andl $7, %r10d
+; AVXNOVLBW-NEXT: andl $7, %ecx
+; AVXNOVLBW-NEXT: andl $7, %edx
+; AVXNOVLBW-NEXT: andl $7, %esi
+; AVXNOVLBW-NEXT: andl $7, %edi
+; AVXNOVLBW-NEXT: andl $7, %r8d
+; AVXNOVLBW-NEXT: andl $7, %r9d
+; AVXNOVLBW-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVXNOVLBW-NEXT: vmovd %eax, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $1, -24(%rsp,%r10,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $2, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $3, -24(%rsp,%rdx,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $4, -24(%rsp,%rsi,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $5, -24(%rsp,%rdi,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $6, -24(%rsp,%r8,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $7, -24(%rsp,%r9,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: retq
+;
+; AVX512VLBW-LABEL: var_shuffle_v8i16:
+; AVX512VLBW: # BB#0:
+; AVX512VLBW-NEXT: vpermw %xmm0, %xmm1, %xmm0
+; AVX512VLBW-NEXT: retq
%index0 = extractelement <8 x i16> %indices, i32 0
%index1 = extractelement <8 x i16> %indices, i32 1
%index2 = extractelement <8 x i16> %indices, i32 2
@@ -202,143 +207,13 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSSE3-LABEL: var_shuffle_v16i8:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm8
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm15
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm9
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm10
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm7
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm11
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm6
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm12
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm5
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm13
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm4
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm14
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: var_shuffle_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpextrb $0, %xmm1, %eax
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movzbl (%rax,%rcx), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpextrb $1, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $1, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $2, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $2, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $3, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $3, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $4, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $4, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $5, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $5, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $6, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $6, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $7, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $7, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $8, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $8, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $9, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $9, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $10, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $10, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $11, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $11, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $12, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $12, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $13, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $13, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $14, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $14, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $15, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $15, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%index0 = extractelement <16 x i8> %indices, i32 0
%index1 = extractelement <16 x i8> %indices, i32 1
diff --git a/test/CodeGen/X86/var-permute-256.ll b/test/CodeGen/X86/var-permute-256.ll
index dff145314ea..beef4643c13 100644
--- a/test/CodeGen/X86/var-permute-256.ll
+++ b/test/CodeGen/X86/var-permute-256.ll
@@ -34,32 +34,69 @@ define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
-; INT256-LABEL: var_shuffle_v4i64:
-; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vmovq %xmm1, %rax
-; INT256-NEXT: andl $3, %eax
-; INT256-NEXT: vpextrq $1, %xmm1, %rcx
-; INT256-NEXT: andl $3, %ecx
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm1
-; INT256-NEXT: vmovq %xmm1, %rdx
-; INT256-NEXT: andl $3, %edx
-; INT256-NEXT: vpextrq $1, %xmm1, %rsi
-; INT256-NEXT: andl $3, %esi
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; INT256-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; INT256-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; INT256-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; INT256-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; INT256-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; INT256-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
-; INT256-NEXT: retq
+; AVX2-LABEL: var_shuffle_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: andl $3, %eax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: andl $3, %ecx
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vmovq %xmm1, %rdx
+; AVX2-NEXT: andl $3, %edx
+; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT: andl $3, %esi
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: var_shuffle_v4i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: andl $3, %eax
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512F-NEXT: andl $3, %ecx
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vmovq %xmm1, %rdx
+; AVX512F-NEXT: andl $3, %edx
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512F-NEXT: andl $3, %esi
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBW-LABEL: var_shuffle_v4i64:
+; AVX512VLBW: # BB#0:
+; AVX512VLBW-NEXT: vpermpd %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
%index0 = extractelement <4 x i64> %indices, i32 0
%index1 = extractelement <4 x i64> %indices, i32 1
%index2 = extractelement <4 x i64> %indices, i32 2
@@ -120,44 +157,7 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
;
; INT256-LABEL: var_shuffle_v8i32:
; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vpextrq $1, %xmm1, %r8
-; INT256-NEXT: movq %r8, %rcx
-; INT256-NEXT: shrq $30, %rcx
-; INT256-NEXT: vmovq %xmm1, %r9
-; INT256-NEXT: movq %r9, %rsi
-; INT256-NEXT: shrq $30, %rsi
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm1
-; INT256-NEXT: vpextrq $1, %xmm1, %r10
-; INT256-NEXT: movq %r10, %rdi
-; INT256-NEXT: shrq $30, %rdi
-; INT256-NEXT: vmovq %xmm1, %rax
-; INT256-NEXT: movq %rax, %rdx
-; INT256-NEXT: shrq $30, %rdx
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: andl $7, %r9d
-; INT256-NEXT: andl $28, %esi
-; INT256-NEXT: andl $7, %r8d
-; INT256-NEXT: andl $28, %ecx
-; INT256-NEXT: andl $7, %eax
-; INT256-NEXT: andl $28, %edx
-; INT256-NEXT: andl $7, %r10d
-; INT256-NEXT: andl $28, %edi
-; INT256-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; INT256-NEXT: movq %rsp, %rax
-; INT256-NEXT: vpinsrd $1, (%rdx,%rax), %xmm0, %xmm0
-; INT256-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm0, %xmm0
-; INT256-NEXT: vpinsrd $3, (%rdi,%rax), %xmm0, %xmm0
-; INT256-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; INT256-NEXT: vpinsrd $1, (%rsi,%rax), %xmm1, %xmm1
-; INT256-NEXT: vpinsrd $2, (%rsp,%r8,4), %xmm1, %xmm1
-; INT256-NEXT: vpinsrd $3, (%rcx,%rax), %xmm1, %xmm1
-; INT256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
+; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
%index0 = extractelement <8 x i32> %indices, i32 0
%index1 = extractelement <8 x i32> %indices, i32 1
@@ -250,68 +250,199 @@ define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwi
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
-; INT256-LABEL: var_shuffle_v16i16:
-; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm2
-; INT256-NEXT: vmovd %xmm2, %eax
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: movzwl (%rsp,%rax,2), %eax
-; INT256-NEXT: vmovd %eax, %xmm0
-; INT256-NEXT: vpextrw $1, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $2, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $3, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $4, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $5, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $6, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $7, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vmovd %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: movzwl (%rsp,%rax,2), %eax
-; INT256-NEXT: vmovd %eax, %xmm2
-; INT256-NEXT: vpextrw $1, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $2, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $3, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $4, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $5, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $6, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $7, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
-; INT256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
-; INT256-NEXT: retq
+; AVX2-LABEL: var_shuffle_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpextrw $1, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $2, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $3, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $4, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $5, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $6, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $7, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpextrw $1, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $2, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $3, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $4, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $5, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $6, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $7, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: var_shuffle_v16i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vpextrw $1, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $2, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $3, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $4, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $5, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $6, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $7, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $64, %rsp
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovd %xmm2, %eax
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm0
+; AVX512VL-NEXT: vpextrw $1, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $2, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $3, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $4, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $5, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $6, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $7, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm2
+; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: movq %rbp, %rsp
+; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBW-LABEL: var_shuffle_v16i16:
+; AVX512VLBW: # BB#0:
+; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
%index0 = extractelement <16 x i16> %indices, i32 0
%index1 = extractelement <16 x i16> %indices, i32 1
%index2 = extractelement <16 x i16> %indices, i32 2
@@ -492,133 +623,394 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
-; INT256-LABEL: var_shuffle_v32i8:
-; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm2
-; INT256-NEXT: vpextrb $0, %xmm2, %eax
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movq %rsp, %rcx
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vmovd %eax, %xmm0
-; INT256-NEXT: vpextrb $1, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $2, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $3, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $4, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $5, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $6, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $7, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $8, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $9, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $10, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $11, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $12, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $13, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $14, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $15, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $0, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vmovd %eax, %xmm2
-; INT256-NEXT: vpextrb $1, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $2, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $3, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $4, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $5, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $6, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $7, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $8, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $9, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $10, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $11, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $12, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $13, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $14, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $15, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; INT256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
-; INT256-NEXT: retq
+; AVX2-LABEL: var_shuffle_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movq %rsp, %rcx
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: var_shuffle_v32i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movq %rsp, %rcx
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vpextrb $1, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $2, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $3, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $4, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $5, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $6, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $7, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $8, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $9, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $10, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $11, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $12, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $13, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $14, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $15, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v32i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $64, %rsp
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movq %rsp, %rcx
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm0
+; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm2
+; AVX512VL-NEXT: vpextrb $1, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $2, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $3, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $4, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $5, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $6, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $7, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $8, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $9, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $10, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $11, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $12, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $13, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $14, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $15, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: movq %rbp, %rsp
+; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: retq
+;
+; VBMI-LABEL: var_shuffle_v32i8:
+; VBMI: # BB#0:
+; VBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; VBMI-NEXT: retq
%index0 = extractelement <32 x i8> %indices, i32 0
%index1 = extractelement <32 x i8> %indices, i32 1
%index2 = extractelement <32 x i8> %indices, i32 2
@@ -744,30 +1136,65 @@ define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) noun
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
-; INT256-LABEL: var_shuffle_v4f64:
-; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vmovq %xmm1, %rax
-; INT256-NEXT: andl $3, %eax
-; INT256-NEXT: vpextrq $1, %xmm1, %rcx
-; INT256-NEXT: andl $3, %ecx
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm1
-; INT256-NEXT: vmovq %xmm1, %rdx
-; INT256-NEXT: andl $3, %edx
-; INT256-NEXT: vpextrq $1, %xmm1, %rsi
-; INT256-NEXT: andl $3, %esi
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; INT256-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; INT256-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; INT256-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; INT256-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
-; INT256-NEXT: retq
+; AVX2-LABEL: var_shuffle_v4f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: andl $3, %eax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: andl $3, %ecx
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vmovq %xmm1, %rdx
+; AVX2-NEXT: andl $3, %edx
+; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT: andl $3, %esi
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: var_shuffle_v4f64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: andl $3, %eax
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512F-NEXT: andl $3, %ecx
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vmovq %xmm1, %rdx
+; AVX512F-NEXT: andl $3, %edx
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512F-NEXT: andl $3, %esi
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v4f64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBW-LABEL: var_shuffle_v4f64:
+; AVX512VLBW: # BB#0:
+; AVX512VLBW-NEXT: vpermpd %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
%index0 = extractelement <4 x i64> %indices, i32 0
%index1 = extractelement <4 x i64> %indices, i32 1
%index2 = extractelement <4 x i64> %indices, i32 2
@@ -828,44 +1255,7 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi
;
; INT256-LABEL: var_shuffle_v8f32:
; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vpextrq $1, %xmm1, %r8
-; INT256-NEXT: movq %r8, %rcx
-; INT256-NEXT: shrq $30, %rcx
-; INT256-NEXT: vmovq %xmm1, %r9
-; INT256-NEXT: movq %r9, %rdx
-; INT256-NEXT: shrq $30, %rdx
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm1
-; INT256-NEXT: vpextrq $1, %xmm1, %r10
-; INT256-NEXT: movq %r10, %rdi
-; INT256-NEXT: shrq $30, %rdi
-; INT256-NEXT: vmovq %xmm1, %rax
-; INT256-NEXT: movq %rax, %rsi
-; INT256-NEXT: shrq $30, %rsi
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: andl $7, %r9d
-; INT256-NEXT: andl $28, %edx
-; INT256-NEXT: andl $7, %r8d
-; INT256-NEXT: andl $28, %ecx
-; INT256-NEXT: andl $7, %eax
-; INT256-NEXT: andl $28, %esi
-; INT256-NEXT: andl $7, %r10d
-; INT256-NEXT: andl $28, %edi
-; INT256-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; INT256-NEXT: movq %rsp, %rax
-; INT256-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; INT256-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; INT256-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; INT256-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; INT256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; INT256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; INT256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
-; INT256-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
+; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
%index0 = extractelement <8 x i32> %indices, i32 0
%index1 = extractelement <8 x i32> %indices, i32 1
diff --git a/test/CodeGen/X86/var-permute-512.ll b/test/CodeGen/X86/var-permute-512.ll
index bd1f220ceb1..15c7a1c8b8b 100644
--- a/test/CodeGen/X86/var-permute-512.ll
+++ b/test/CodeGen/X86/var-permute-512.ll
@@ -6,47 +6,7 @@
define <8 x i64> @var_shuffle_v8i64(<8 x i64> %v, <8 x i64> %indices) nounwind {
; AVX512-LABEL: var_shuffle_v8i64:
; AVX512: # BB#0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vmovq %xmm1, %r8
-; AVX512-NEXT: andl $7, %r8d
-; AVX512-NEXT: vpextrq $1, %xmm1, %r9
-; AVX512-NEXT: andl $7, %r9d
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %r10
-; AVX512-NEXT: andl $7, %r10d
-; AVX512-NEXT: vpextrq $1, %xmm2, %rsi
-; AVX512-NEXT: andl $7, %esi
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %rdi
-; AVX512-NEXT: andl $7, %edi
-; AVX512-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512-NEXT: andl $7, %eax
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rcx
-; AVX512-NEXT: andl $7, %ecx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512-NEXT: andl $7, %edx
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%index0 = extractelement <8 x i64> %indices, i32 0
%index1 = extractelement <8 x i64> %indices, i32 1
@@ -78,76 +38,7 @@ define <8 x i64> @var_shuffle_v8i64(<8 x i64> %v, <8 x i64> %indices) nounwind {
define <16 x i32> @var_shuffle_v16i32(<16 x i32> %v, <16 x i32> %indices) nounwind {
; AVX512-LABEL: var_shuffle_v16i32:
; AVX512: # BB#0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512-NEXT: vpextrq $1, %xmm4, %rax
-; AVX512-NEXT: vmovq %xmm4, %rdx
-; AVX512-NEXT: movq %rdx, %rcx
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: andl $15, %edx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: movq %rsp, %rdx
-; AVX512-NEXT: vpinsrd $1, (%rcx,%rdx), %xmm0, %xmm0
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vpinsrd $2, (%rsp,%rax,4), %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm3, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $3, (%rcx,%rdx), %xmm0, %xmm0
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $1, (%rcx,%rdx), %xmm4, %xmm3
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vpinsrd $2, (%rsp,%rax,4), %xmm3, %xmm3
-; AVX512-NEXT: vmovq %xmm2, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $3, (%rcx,%rdx), %xmm3, %xmm3
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $1, (%rcx,%rdx), %xmm4, %xmm2
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vpinsrd $2, (%rsp,%rax,4), %xmm2, %xmm2
-; AVX512-NEXT: vmovq %xmm1, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $3, (%rcx,%rdx), %xmm2, %xmm2
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $1, (%rcx,%rdx), %xmm4, %xmm1
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vpinsrd $2, (%rsp,%rax,4), %xmm1, %xmm1
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $3, (%rcx,%rdx), %xmm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%index0 = extractelement <16 x i32> %indices, i32 0
%index1 = extractelement <16 x i32> %indices, i32 1
@@ -381,136 +272,7 @@ define <32 x i16> @var_shuffle_v32i16(<32 x i16> %v, <32 x i16> %indices) nounwi
;
; AVX512BW-LABEL: var_shuffle_v32i16:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512BW-NEXT: vmovd %xmm4, %eax
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm0
-; AVX512BW-NEXT: vpextrw $1, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $2, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $3, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $4, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $5, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $6, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $7, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovd %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm4
-; AVX512BW-NEXT: vpextrw $1, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $2, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $3, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $4, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $5, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $6, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm4, %xmm3
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm4
-; AVX512BW-NEXT: vpextrw $1, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $2, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $3, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $4, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $5, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $6, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT: vmovd %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm4
-; AVX512BW-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm4, %xmm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
+; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
%index0 = extractelement <32 x i16> %indices, i32 0
%index1 = extractelement <32 x i16> %indices, i32 1
@@ -1014,267 +776,10 @@ define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind {
; NOBW-NEXT: popq %rbp
; NOBW-NEXT: retq
;
-; AVX512BW-LABEL: var_shuffle_v64i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movq %rsp, %rsi
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vmovd %edx, %xmm0
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $6, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $7, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $9, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $10, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $11, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $15, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vmovd %edx, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $2, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $3, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $4, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $5, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $12, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $13, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $15, %edx, %xmm4, %xmm3
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vmovd %edx, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $2, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $3, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $4, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $5, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $12, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $13, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $15, %edx, %xmm4, %xmm2
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: movzbl (%rcx,%rsi), %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT: andl $63, %eax
-; AVX512BW-NEXT: vpinsrb $1, (%rax,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: vpinsrb $2, (%rdx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: vpinsrb $3, (%rcx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT: andl $63, %eax
-; AVX512BW-NEXT: vpinsrb $4, (%rax,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: vpinsrb $5, (%rdx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: vpinsrb $6, (%rcx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT: andl $63, %eax
-; AVX512BW-NEXT: vpinsrb $7, (%rax,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: vpinsrb $8, (%rdx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: vpinsrb $9, (%rcx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT: andl $63, %eax
-; AVX512BW-NEXT: vpinsrb $10, (%rax,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: vpinsrb $11, (%rdx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: vpinsrb $12, (%rcx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT: andl $63, %eax
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: movzbl (%rcx,%rsi), %ecx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: movzbl (%rax,%rsi), %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm1
-; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1
-; AVX512BW-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: retq
+; VBMI-LABEL: var_shuffle_v64i8:
+; VBMI: # BB#0:
+; VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; VBMI-NEXT: retq
%index0 = extractelement <64 x i8> %indices, i32 0
%index1 = extractelement <64 x i8> %indices, i32 1
%index2 = extractelement <64 x i8> %indices, i32 2
@@ -1473,43 +978,7 @@ define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind {
define <8 x double> @var_shuffle_v8f64(<8 x double> %v, <8 x i64> %indices) nounwind {
; AVX512-LABEL: var_shuffle_v8f64:
; AVX512: # BB#0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vmovq %xmm1, %r8
-; AVX512-NEXT: andl $7, %r8d
-; AVX512-NEXT: vpextrq $1, %xmm1, %r9
-; AVX512-NEXT: andl $7, %r9d
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %r10
-; AVX512-NEXT: andl $7, %r10d
-; AVX512-NEXT: vpextrq $1, %xmm2, %rsi
-; AVX512-NEXT: andl $7, %esi
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %rdi
-; AVX512-NEXT: andl $7, %edi
-; AVX512-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512-NEXT: andl $7, %eax
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rcx
-; AVX512-NEXT: andl $7, %ecx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512-NEXT: andl $7, %edx
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%index0 = extractelement <8 x i64> %indices, i32 0
%index1 = extractelement <8 x i64> %indices, i32 1
@@ -1541,76 +1010,7 @@ define <8 x double> @var_shuffle_v8f64(<8 x double> %v, <8 x i64> %indices) noun
define <16 x float> @var_shuffle_v16f32(<16 x float> %v, <16 x i32> %indices) nounwind {
; AVX512-LABEL: var_shuffle_v16f32:
; AVX512: # BB#0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512-NEXT: vpextrq $1, %xmm4, %rax
-; AVX512-NEXT: vmovq %xmm4, %rdx
-; AVX512-NEXT: movq %rdx, %rcx
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: andl $15, %edx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: movq %rsp, %rdx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX512-NEXT: vmovq %xmm3, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],mem[0],xmm4[2,3]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
-; AVX512-NEXT: vmovq %xmm2, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0],mem[0],xmm4[2,3]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX512-NEXT: vmovq %xmm1, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],mem[0],xmm4[2,3]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%index0 = extractelement <16 x i32> %indices, i32 0
%index1 = extractelement <16 x i32> %indices, i32 1
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index c6335d751ed..2f52bab2803 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -2288,67 +2288,19 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
; VEX-NEXT: popq %rax
; VEX-NEXT: retq
;
-; AVX512F-LABEL: fptosi_2f16_to_4i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: vcvttss2si %xmm1, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vcvttss2si %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_2f16_to_4i32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: vcvttss2si %xmm1, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_2f16_to_4i32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512DQ-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512DQ-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512DQ-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512DQ-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512DQ-NEXT: vcvttss2si %xmm1, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm1
-; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_2f16_to_4i32:
-; AVX512VLDQ: # BB#0:
-; AVX512VLDQ-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VLDQ-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VLDQ-NEXT: vcvttss2si %xmm1, %rax
-; AVX512VLDQ-NEXT: vmovq %rax, %xmm1
-; AVX512VLDQ-NEXT: vcvttss2si %xmm0, %rax
-; AVX512VLDQ-NEXT: vmovq %rax, %xmm0
-; AVX512VLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512VLDQ-NEXT: retq
+; AVX512-LABEL: fptosi_2f16_to_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vcvttss2si %xmm1, %rax
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vcvttss2si %xmm0, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX512-NEXT: retq
%cvt = fptosi <2 x half> %a to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %ext
diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll
index 6e664ba98d9..9feff88a576 100644
--- a/test/CodeGen/X86/vector-half-conversions.ll
+++ b/test/CodeGen/X86/vector-half-conversions.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
;
@@ -9,35 +9,12 @@
;
define float @cvt_i16_to_f32(i16 %a0) nounwind {
-; AVX1-LABEL: cvt_i16_to_f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl %di, %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_i16_to_f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl %di, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_i16_to_f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl %di, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_i16_to_f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl %di, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_i16_to_f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: retq
%1 = bitcast i16 %a0 to half
%2 = fpext half %1 to float
ret float %2
@@ -111,19 +88,18 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4i16_to_4f32:
@@ -222,19 +198,18 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_4f32:
@@ -271,201 +246,54 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
}
define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
-; AVX1-LABEL: cvt_8i16_to_8f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX1-NEXT: movq %rdx, %r8
-; AVX1-NEXT: movq %rdx, %r10
-; AVX1-NEXT: movswl %dx, %r9d
-; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
-; AVX1-NEXT: shrl $16, %edx
-; AVX1-NEXT: shrq $32, %r8
-; AVX1-NEXT: shrq $48, %r10
-; AVX1-NEXT: vmovq %xmm0, %rdi
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: movq %rdi, %rsi
-; AVX1-NEXT: movswl %di, %ecx
-; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
-; AVX1-NEXT: shrl $16, %edi
-; AVX1-NEXT: shrq $32, %rax
-; AVX1-NEXT: shrq $48, %rsi
-; AVX1-NEXT: movswl %si, %esi
-; AVX1-NEXT: vmovd %esi, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: cwtl
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: movswl %di, %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: vmovd %ecx, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: movswl %r10w, %eax
-; AVX1-NEXT: vmovd %eax, %xmm4
-; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX1-NEXT: movswl %r8w, %eax
-; AVX1-NEXT: vmovd %eax, %xmm5
-; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT: movswl %dx, %eax
-; AVX1-NEXT: vmovd %eax, %xmm6
-; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX1-NEXT: vmovd %r9d, %xmm7
-; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_8i16_to_8f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX2-NEXT: movq %rdx, %r8
-; AVX2-NEXT: movq %rdx, %r10
-; AVX2-NEXT: movswl %dx, %r9d
-; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
-; AVX2-NEXT: shrl $16, %edx
-; AVX2-NEXT: shrq $32, %r8
-; AVX2-NEXT: shrq $48, %r10
-; AVX2-NEXT: vmovq %xmm0, %rdi
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: movq %rdi, %rsi
-; AVX2-NEXT: movswl %di, %ecx
-; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
-; AVX2-NEXT: shrl $16, %edi
-; AVX2-NEXT: shrq $32, %rax
-; AVX2-NEXT: shrq $48, %rsi
-; AVX2-NEXT: movswl %si, %esi
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: cwtl
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: movswl %di, %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: vmovd %ecx, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: movswl %r10w, %eax
-; AVX2-NEXT: vmovd %eax, %xmm4
-; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX2-NEXT: movswl %r8w, %eax
-; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT: movswl %dx, %eax
-; AVX2-NEXT: vmovd %eax, %xmm6
-; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX2-NEXT: vmovd %r9d, %xmm7
-; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_8i16_to_8f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512F-NEXT: movq %rdx, %r8
-; AVX512F-NEXT: movq %rdx, %r9
-; AVX512F-NEXT: movswl %dx, %r10d
-; AVX512F-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
-; AVX512F-NEXT: shrl $16, %edx
-; AVX512F-NEXT: shrq $32, %r8
-; AVX512F-NEXT: shrq $48, %r9
-; AVX512F-NEXT: vmovq %xmm0, %rdi
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq %rdi, %rcx
-; AVX512F-NEXT: movswl %di, %esi
-; AVX512F-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
-; AVX512F-NEXT: shrl $16, %edi
-; AVX512F-NEXT: shrq $32, %rax
-; AVX512F-NEXT: shrq $48, %rcx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl %di, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: vmovd %esi, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: movswl %r9w, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: movswl %r8w, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: movswl %dx, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: vmovd %r10d, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_8i16_to_8f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512VL-NEXT: movq %rdx, %r8
-; AVX512VL-NEXT: movq %rdx, %r10
-; AVX512VL-NEXT: movswl %dx, %r9d
-; AVX512VL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
-; AVX512VL-NEXT: shrl $16, %edx
-; AVX512VL-NEXT: shrq $32, %r8
-; AVX512VL-NEXT: shrq $48, %r10
-; AVX512VL-NEXT: vmovq %xmm0, %rdi
-; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq %rdi, %rsi
-; AVX512VL-NEXT: movswl %di, %ecx
-; AVX512VL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
-; AVX512VL-NEXT: shrl $16, %edi
-; AVX512VL-NEXT: shrq $32, %rax
-; AVX512VL-NEXT: shrq $48, %rsi
-; AVX512VL-NEXT: movswl %si, %esi
-; AVX512VL-NEXT: vmovd %esi, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: cwtl
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl %di, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %ecx, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl %r10w, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: movswl %r8w, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl %dx, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: vmovd %r9d, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_8i16_to_8f32:
+; ALL: # BB#0:
+; ALL-NEXT: vpextrq $1, %xmm0, %rdx
+; ALL-NEXT: movq %rdx, %r8
+; ALL-NEXT: movq %rdx, %r10
+; ALL-NEXT: movswl %dx, %r9d
+; ALL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
+; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: shrq $32, %r8
+; ALL-NEXT: shrq $48, %r10
+; ALL-NEXT: vmovq %xmm0, %rdi
+; ALL-NEXT: movq %rdi, %rax
+; ALL-NEXT: movq %rdi, %rsi
+; ALL-NEXT: movswl %di, %ecx
+; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
+; ALL-NEXT: shrl $16, %edi
+; ALL-NEXT: shrq $32, %rax
+; ALL-NEXT: shrq $48, %rsi
+; ALL-NEXT: movswl %si, %esi
+; ALL-NEXT: vmovd %esi, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vmovd %ecx, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: movswl %r10w, %eax
+; ALL-NEXT: vmovd %eax, %xmm4
+; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
+; ALL-NEXT: movswl %r8w, %eax
+; ALL-NEXT: vmovd %eax, %xmm5
+; ALL-NEXT: vcvtph2ps %xmm5, %xmm5
+; ALL-NEXT: movswl %dx, %eax
+; ALL-NEXT: vmovd %eax, %xmm6
+; ALL-NEXT: vcvtph2ps %xmm6, %xmm6
+; ALL-NEXT: vmovd %r9d, %xmm7
+; ALL-NEXT: vcvtph2ps %xmm7, %xmm7
+; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; ALL-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
%2 = fpext <8 x half> %1 to <8 x float>
ret <8 x float> %2
@@ -664,98 +492,98 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
;
; AVX512F-LABEL: cvt_16i16_to_16f32:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm10
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm2
+; AVX512F-NEXT: vmovd %ecx, %xmm8
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm3
+; AVX512F-NEXT: vmovd %ecx, %xmm9
; AVX512F-NEXT: movswl %ax, %ecx
; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm4
+; AVX512F-NEXT: vmovd %eax, %xmm11
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vmovd %ecx, %xmm0
+; AVX512F-NEXT: vmovd %ecx, %xmm12
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm5
+; AVX512F-NEXT: vmovd %ecx, %xmm13
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm6
+; AVX512F-NEXT: vmovd %ecx, %xmm14
; AVX512F-NEXT: movswl %ax, %ecx
; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: vmovd %ecx, %xmm8
+; AVX512F-NEXT: vmovd %eax, %xmm15
+; AVX512F-NEXT: vmovq %xmm10, %rax
+; AVX512F-NEXT: vmovd %ecx, %xmm2
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm9
+; AVX512F-NEXT: vmovd %ecx, %xmm3
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm10
+; AVX512F-NEXT: vmovd %ecx, %xmm1
; AVX512F-NEXT: movswl %ax, %ecx
; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm11
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vmovd %ecx, %xmm1
+; AVX512F-NEXT: vmovd %eax, %xmm4
+; AVX512F-NEXT: vpextrq $1, %xmm10, %rax
+; AVX512F-NEXT: vmovd %ecx, %xmm10
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm12
+; AVX512F-NEXT: vmovd %ecx, %xmm5
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm13
+; AVX512F-NEXT: vmovd %ecx, %xmm6
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: shrl $16, %ecx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm14
+; AVX512F-NEXT: vmovd %ecx, %xmm7
; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm15
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm16
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vcvtph2ps %ymm8, %zmm8
-; AVX512F-NEXT: vcvtph2ps %ymm9, %zmm9
-; AVX512F-NEXT: vcvtph2ps %ymm10, %zmm10
-; AVX512F-NEXT: vcvtph2ps %ymm11, %zmm11
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: vcvtph2ps %ymm12, %zmm12
-; AVX512F-NEXT: vcvtph2ps %ymm13, %zmm13
-; AVX512F-NEXT: vcvtph2ps %ymm14, %zmm14
-; AVX512F-NEXT: vcvtph2ps %ymm15, %zmm15
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm13[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm10[0],xmm1[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm9[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm6[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm16[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm8, %xmm8
+; AVX512F-NEXT: vcvtph2ps %xmm9, %xmm9
+; AVX512F-NEXT: vcvtph2ps %xmm11, %xmm11
+; AVX512F-NEXT: vcvtph2ps %xmm12, %xmm12
+; AVX512F-NEXT: vcvtph2ps %xmm13, %xmm13
+; AVX512F-NEXT: vcvtph2ps %xmm14, %xmm14
+; AVX512F-NEXT: vcvtph2ps %xmm15, %xmm15
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512F-NEXT: vcvtph2ps %xmm10, %xmm10
+; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_16i16_to_16f32:
@@ -863,35 +691,12 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
;
define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
-; AVX1-LABEL: load_cvt_i16_to_f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_i16_to_f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_i16_to_f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_i16_to_f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_i16_to_f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: retq
%1 = load i16, i16* %a0
%2 = bitcast i16 %1 to half
%3 = fpext half %2 to float
@@ -899,82 +704,24 @@ define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
}
define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_4i16_to_4f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl 6(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 4(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_4i16_to_4f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl 6(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 4(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_4i16_to_4f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_4i16_to_4f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_4i16_to_4f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl 6(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a0
%2 = bitcast <4 x i16> %1 to <4 x half>
%3 = fpext <4 x half> %2 to <4 x float>
@@ -1046,19 +793,18 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_8i16_to_4f32:
@@ -1096,145 +842,40 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
}
define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_8i16_to_8f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl 6(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 4(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: movswl 14(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm4
-; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX1-NEXT: movswl 12(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm5
-; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT: movswl 8(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm6
-; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX1-NEXT: movswl 10(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm7
-; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_8i16_to_8f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl 6(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 4(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: movswl 14(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm4
-; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX2-NEXT: movswl 12(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT: movswl 8(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm6
-; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX2-NEXT: movswl 10(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm7
-; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_8i16_to_8f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: movswl 14(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: movswl 12(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: movswl 8(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: movswl 10(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_8i16_to_8f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl 14(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: movswl 12(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl 8(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: movswl 10(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_8i16_to_8f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl 6(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: movswl 14(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm4
+; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
+; ALL-NEXT: movswl 12(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm5
+; ALL-NEXT: vcvtph2ps %xmm5, %xmm5
+; ALL-NEXT: movswl 8(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm6
+; ALL-NEXT: vcvtph2ps %xmm6, %xmm6
+; ALL-NEXT: movswl 10(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm7
+; ALL-NEXT: vcvtph2ps %xmm7, %xmm7
+; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; ALL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = bitcast <8 x i16> %1 to <8 x half>
%3 = fpext <8 x half> %2 to <8 x float>
@@ -1378,65 +1019,65 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
; AVX512F: # BB#0:
; AVX512F-NEXT: movswl 6(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm16
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm8
; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm17
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm9
; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm10
; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm11
; AVX512F-NEXT: movswl 14(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm12
; AVX512F-NEXT: movswl 12(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm13
; AVX512F-NEXT: movswl 8(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm14
; AVX512F-NEXT: movswl 10(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm15
; AVX512F-NEXT: movswl 22(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm8
-; AVX512F-NEXT: vcvtph2ps %ymm8, %zmm8
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl 20(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm9
-; AVX512F-NEXT: vcvtph2ps %ymm9, %zmm9
+; AVX512F-NEXT: vmovd %eax, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl 16(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm10
-; AVX512F-NEXT: vcvtph2ps %ymm10, %zmm10
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: movswl 18(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm11
-; AVX512F-NEXT: vcvtph2ps %ymm11, %zmm11
+; AVX512F-NEXT: vmovd %eax, %xmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: movswl 30(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm12
-; AVX512F-NEXT: vcvtph2ps %ymm12, %zmm12
+; AVX512F-NEXT: vmovd %eax, %xmm4
+; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
; AVX512F-NEXT: movswl 28(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm13
-; AVX512F-NEXT: vcvtph2ps %ymm13, %zmm13
+; AVX512F-NEXT: vmovd %eax, %xmm5
+; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5
; AVX512F-NEXT: movswl 24(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm14
-; AVX512F-NEXT: vcvtph2ps %ymm14, %zmm14
+; AVX512F-NEXT: vmovd %eax, %xmm6
+; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX512F-NEXT: movswl 26(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm15
-; AVX512F-NEXT: vcvtph2ps %ymm15, %zmm15
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm13[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm12[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0],xmm1[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; AVX512F-NEXT: vmovd %eax, %xmm7
+; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm17[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm16[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
@@ -1518,38 +1159,13 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
;
define double @cvt_i16_to_f64(i16 %a0) nounwind {
-; AVX1-LABEL: cvt_i16_to_f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl %di, %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_i16_to_f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl %di, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_i16_to_f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl %di, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_i16_to_f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl %di, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_i16_to_f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: retq
%1 = bitcast i16 %a0 to half
%2 = fpext half %1 to double
ret double %2
@@ -1599,13 +1215,12 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_2i16_to_2f64:
@@ -1701,15 +1316,15 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %esi, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
@@ -1791,13 +1406,12 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_2f64:
@@ -1892,15 +1506,15 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %esi, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
@@ -1950,25 +1564,25 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_8f64:
; AVX1: # BB#0:
; AVX1-NEXT: vmovq %xmm0, %rdx
-; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: movq %rdx, %r9
; AVX1-NEXT: movl %edx, %r10d
-; AVX1-NEXT: movswl %dx, %r9d
+; AVX1-NEXT: movswl %dx, %r8d
; AVX1-NEXT: shrq $48, %rdx
-; AVX1-NEXT: shrq $32, %r8
+; AVX1-NEXT: shrq $32, %r9
; AVX1-NEXT: shrl $16, %r10d
; AVX1-NEXT: vpextrq $1, %xmm0, %rdi
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: movl %edi, %esi
+; AVX1-NEXT: movq %rdi, %rsi
+; AVX1-NEXT: movl %edi, %eax
; AVX1-NEXT: movswl %di, %ecx
; AVX1-NEXT: shrq $48, %rdi
-; AVX1-NEXT: shrq $32, %rax
-; AVX1-NEXT: shrl $16, %esi
-; AVX1-NEXT: movswl %si, %esi
-; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: shrq $32, %rsi
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX1-NEXT: cwtl
+; AVX1-NEXT: movswl %si, %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3
; AVX1-NEXT: movswl %di, %eax
@@ -1977,9 +1591,9 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX1-NEXT: movswl %r10w, %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: vmovd %r9d, %xmm5
+; AVX1-NEXT: vmovd %r8d, %xmm5
; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT: movswl %r8w, %eax
+; AVX1-NEXT: movswl %r9w, %eax
; AVX1-NEXT: vmovd %eax, %xmm6
; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX1-NEXT: movswl %dx, %eax
@@ -2004,25 +1618,25 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX2-LABEL: cvt_8i16_to_8f64:
; AVX2: # BB#0:
; AVX2-NEXT: vmovq %xmm0, %rdx
-; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: movq %rdx, %r9
; AVX2-NEXT: movl %edx, %r10d
-; AVX2-NEXT: movswl %dx, %r9d
+; AVX2-NEXT: movswl %dx, %r8d
; AVX2-NEXT: shrq $48, %rdx
-; AVX2-NEXT: shrq $32, %r8
+; AVX2-NEXT: shrq $32, %r9
; AVX2-NEXT: shrl $16, %r10d
; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: movl %edi, %esi
+; AVX2-NEXT: movq %rdi, %rsi
+; AVX2-NEXT: movl %edi, %eax
; AVX2-NEXT: movswl %di, %ecx
; AVX2-NEXT: shrq $48, %rdi
-; AVX2-NEXT: shrq $32, %rax
-; AVX2-NEXT: shrl $16, %esi
-; AVX2-NEXT: movswl %si, %esi
-; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: shrq $32, %rsi
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
; AVX2-NEXT: vmovd %ecx, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX2-NEXT: cwtl
+; AVX2-NEXT: movswl %si, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
; AVX2-NEXT: movswl %di, %eax
@@ -2031,9 +1645,9 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX2-NEXT: movswl %r10w, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vmovd %r9d, %xmm5
+; AVX2-NEXT: vmovd %r8d, %xmm5
; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT: movswl %r8w, %eax
+; AVX2-NEXT: movswl %r9w, %eax
; AVX2-NEXT: vmovd %eax, %xmm6
; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX2-NEXT: movswl %dx, %eax
@@ -2055,115 +1669,60 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: cvt_8i16_to_8f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512F-NEXT: movq %rdx, %r8
-; AVX512F-NEXT: movl %edx, %r9d
-; AVX512F-NEXT: movswl %dx, %r10d
-; AVX512F-NEXT: shrq $48, %rdx
-; AVX512F-NEXT: shrq $32, %r8
-; AVX512F-NEXT: shrl $16, %r9d
-; AVX512F-NEXT: vmovq %xmm0, %rdi
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movl %edi, %ecx
-; AVX512F-NEXT: movswl %di, %esi
-; AVX512F-NEXT: shrq $48, %rdi
-; AVX512F-NEXT: shrq $32, %rax
-; AVX512F-NEXT: shrl $16, %ecx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vmovd %esi, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl %di, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: movswl %r9w, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: vmovd %r10d, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: movswl %r8w, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: movswl %dx, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512F-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_8i16_to_8f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512VL-NEXT: movq %rdx, %r8
-; AVX512VL-NEXT: movl %edx, %r10d
-; AVX512VL-NEXT: movswl %dx, %r9d
-; AVX512VL-NEXT: shrq $48, %rdx
-; AVX512VL-NEXT: shrq $32, %r8
-; AVX512VL-NEXT: shrl $16, %r10d
-; AVX512VL-NEXT: vmovq %xmm0, %rdi
-; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movl %edi, %esi
-; AVX512VL-NEXT: movswl %di, %ecx
-; AVX512VL-NEXT: shrq $48, %rdi
-; AVX512VL-NEXT: shrq $32, %rax
-; AVX512VL-NEXT: shrl $16, %esi
-; AVX512VL-NEXT: movswl %si, %esi
-; AVX512VL-NEXT: vmovd %esi, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %ecx, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: cwtl
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl %di, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl %r10w, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: vmovd %r9d, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl %r8w, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: movswl %dx, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512VL-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: cvt_8i16_to_8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT: movq %rdx, %r9
+; AVX512-NEXT: movl %edx, %r10d
+; AVX512-NEXT: movswl %dx, %r8d
+; AVX512-NEXT: shrq $48, %rdx
+; AVX512-NEXT: shrq $32, %r9
+; AVX512-NEXT: shrl $16, %r10d
+; AVX512-NEXT: vmovq %xmm0, %rdi
+; AVX512-NEXT: movq %rdi, %rsi
+; AVX512-NEXT: movl %edi, %eax
+; AVX512-NEXT: movswl %di, %ecx
+; AVX512-NEXT: shrq $48, %rdi
+; AVX512-NEXT: shrq $32, %rsi
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovd %ecx, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movswl %si, %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movswl %di, %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl %r10w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: vmovd %r8d, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl %r9w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: movswl %dx, %eax
+; AVX512-NEXT: vmovd %eax, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
%2 = fpext <8 x half> %1 to <8 x double>
ret <8 x double> %2
@@ -2174,38 +1733,13 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
;
define double @load_cvt_i16_to_f64(i16* %a0) nounwind {
-; AVX1-LABEL: load_cvt_i16_to_f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_i16_to_f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_i16_to_f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_i16_to_f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_i16_to_f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: retq
%1 = load i16, i16* %a0
%2 = bitcast i16 %1 to half
%3 = fpext half %2 to double
@@ -2213,58 +1747,18 @@ define double @load_cvt_i16_to_f64(i16* %a0) nounwind {
}
define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_2i16_to_2f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_2i16_to_2f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_2i16_to_2f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_2i16_to_2f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_2i16_to_2f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; ALL-NEXT: retq
%1 = load <2 x i16>, <2 x i16>* %a0
%2 = bitcast <2 x i16> %1 to <2 x half>
%3 = fpext <2 x half> %2 to <2 x double>
@@ -2272,97 +1766,28 @@ define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
}
define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_4i16_to_4f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: movswl 4(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: movswl 6(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_4i16_to_4f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: movswl 4(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: movswl 6(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_4i16_to_4f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_4i16_to_4f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_4i16_to_4f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: movswl 6(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a0
%2 = bitcast <4 x i16> %1 to <4 x half>
%3 = fpext <4 x half> %2 to <4 x double>
@@ -2439,15 +1864,15 @@ define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %esi, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
@@ -2579,91 +2004,48 @@ define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: load_cvt_8i16_to_8f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: movswl 8(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: movswl 10(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: movswl 12(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: movswl 14(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512F-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_8i16_to_8f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl 8(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: movswl 10(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl 12(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: movswl 14(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512VL-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: load_cvt_8i16_to_8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: movswl (%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: movswl 2(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movswl 4(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movswl 6(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl 8(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: movswl 10(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl 12(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: movswl 14(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = bitcast <8 x i16> %1 to <8 x half>
%3 = fpext <8 x half> %2 to <8 x double>
@@ -2675,138 +2057,41 @@ define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
;
define i16 @cvt_f32_to_i16(float %a0) nounwind {
-; AVX1-LABEL: cvt_f32_to_i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_f32_to_i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_f32_to_i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_f32_to_i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_f32_to_i16:
+; ALL: # BB#0:
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ALL-NEXT: retq
%1 = fptrunc float %a0 to half
%2 = bitcast half %1 to i16
ret i16 %2
}
define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
-; AVX1-LABEL: cvt_4f32_to_4i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: movzwl %cx, %ecx
-; AVX1-NEXT: orl %eax, %ecx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: movzwl %dx, %edx
-; AVX1-NEXT: orl %eax, %edx
-; AVX1-NEXT: shlq $32, %rdx
-; AVX1-NEXT: orq %rcx, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_4f32_to_4i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: movzwl %cx, %ecx
-; AVX2-NEXT: orl %eax, %ecx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: movzwl %dx, %edx
-; AVX2-NEXT: orl %eax, %edx
-; AVX2-NEXT: shlq $32, %rdx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_4f32_to_4i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
-; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
-; AVX512F-NEXT: orl %eax, %edx
-; AVX512F-NEXT: shlq $32, %rdx
-; AVX512F-NEXT: orq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_4f32_to_4i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: movzwl %cx, %ecx
-; AVX512VL-NEXT: orl %eax, %ecx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %edx
-; AVX512VL-NEXT: movzwl %dx, %edx
-; AVX512VL-NEXT: orl %eax, %edx
-; AVX512VL-NEXT: shlq $32, %rdx
-; AVX512VL-NEXT: orq %rcx, %rdx
-; AVX512VL-NEXT: vmovq %rdx, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_4f32_to_4i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
ret <4 x i16> %2
@@ -2865,29 +2150,27 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
;
; AVX512F-LABEL: cvt_4f32_to_8i16_undef:
; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: movzwl %cx, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
+; AVX512F-NEXT: movzwl %dx, %edx
; AVX512F-NEXT: orl %eax, %edx
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f32_to_8i16_undef:
@@ -2974,29 +2257,27 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
;
; AVX512F-LABEL: cvt_4f32_to_8i16_zero:
; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: movzwl %cx, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
+; AVX512F-NEXT: movzwl %dx, %edx
; AVX512F-NEXT: orl %eax, %edx
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f32_to_8i16_zero:
@@ -3033,194 +2314,52 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
}
define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
-; AVX1-LABEL: cvt_8f32_to_8i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: movzwl %cx, %ecx
-; AVX1-NEXT: orl %eax, %ecx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %edx
-; AVX1-NEXT: shll $16, %edx
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: movzwl %ax, %eax
-; AVX1-NEXT: orl %edx, %eax
-; AVX1-NEXT: shlq $32, %rax
-; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: shll $16, %ecx
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %edx
-; AVX1-NEXT: movzwl %dx, %edx
-; AVX1-NEXT: orl %ecx, %edx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: shll $16, %ecx
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %esi
-; AVX1-NEXT: movzwl %si, %esi
-; AVX1-NEXT: orl %ecx, %esi
-; AVX1-NEXT: shlq $32, %rsi
-; AVX1-NEXT: orq %rdx, %rsi
-; AVX1-NEXT: vmovq %rsi, %xmm0
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_8f32_to_8i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: movzwl %cx, %ecx
-; AVX2-NEXT: orl %eax, %ecx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: shll $16, %edx
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: orl %edx, %eax
-; AVX2-NEXT: shlq $32, %rax
-; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: shll $16, %ecx
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: movzwl %dx, %edx
-; AVX2-NEXT: orl %ecx, %edx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: shll $16, %ecx
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %esi
-; AVX2-NEXT: movzwl %si, %esi
-; AVX2-NEXT: orl %ecx, %esi
-; AVX2-NEXT: shlq $32, %rsi
-; AVX2-NEXT: orq %rdx, %rsi
-; AVX2-NEXT: vmovq %rsi, %xmm0
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_8f32_to_8i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
-; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %edx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: shll $16, %eax
-; AVX512F-NEXT: orl %edx, %eax
-; AVX512F-NEXT: shlq $32, %rax
-; AVX512F-NEXT: orq %rcx, %rax
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: movzwl %cx, %ecx
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %edx
-; AVX512F-NEXT: shll $16, %edx
-; AVX512F-NEXT: orl %ecx, %edx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: movzwl %cx, %ecx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %esi
-; AVX512F-NEXT: shll $16, %esi
-; AVX512F-NEXT: orl %ecx, %esi
-; AVX512F-NEXT: shlq $32, %rsi
-; AVX512F-NEXT: orq %rdx, %rsi
-; AVX512F-NEXT: vmovq %rsi, %xmm0
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_8f32_to_8i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: movzwl %cx, %ecx
-; AVX512VL-NEXT: orl %eax, %ecx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %edx
-; AVX512VL-NEXT: shll $16, %edx
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: movzwl %ax, %eax
-; AVX512VL-NEXT: orl %edx, %eax
-; AVX512VL-NEXT: shlq $32, %rax
-; AVX512VL-NEXT: orq %rcx, %rax
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: shll $16, %ecx
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %edx
-; AVX512VL-NEXT: movzwl %dx, %edx
-; AVX512VL-NEXT: orl %ecx, %edx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: shll $16, %ecx
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %esi
-; AVX512VL-NEXT: movzwl %si, %esi
-; AVX512VL-NEXT: orl %ecx, %esi
-; AVX512VL-NEXT: shlq $32, %rsi
-; AVX512VL-NEXT: orq %rdx, %rsi
-; AVX512VL-NEXT: vmovq %rsi, %xmm0
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_8f32_to_8i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %edx
+; ALL-NEXT: shll $16, %edx
+; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: movzwl %ax, %eax
+; ALL-NEXT: orl %edx, %eax
+; ALL-NEXT: shlq $32, %rax
+; ALL-NEXT: orq %rcx, %rax
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: shll $16, %ecx
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %ecx, %edx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: shll $16, %ecx
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %esi
+; ALL-NEXT: movzwl %si, %esi
+; ALL-NEXT: orl %ecx, %esi
+; ALL-NEXT: shlq $32, %rsi
+; ALL-NEXT: orq %rdx, %rsi
+; ALL-NEXT: vmovq %rsi, %xmm0
+; ALL-NEXT: vmovq %rax, %xmm1
+; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%1 = fptrunc <8 x float> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
ret <8 x i16> %2
@@ -3361,141 +2500,73 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: cvt_16f32_to_16i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm1
-; AVX512F-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm0
-; AVX512F-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_16f32_to_16i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm2
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm1
-; AVX512VL-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX512VL-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: cvt_16f32_to_16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: retq
%1 = fptrunc <16 x float> %a0 to <16 x half>
%2 = bitcast <16 x half> %1 to <16 x i16>
ret <16 x i16> %2
@@ -3506,35 +2577,12 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
;
define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind {
-; AVX1-LABEL: store_cvt_f32_to_i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: movw %ax, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: store_cvt_f32_to_i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: movw %ax, (%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: store_cvt_f32_to_i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: movw %ax, (%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_f32_to_i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: movw %ax, (%rdi)
-; AVX512VL-NEXT: retq
+; ALL-LABEL: store_cvt_f32_to_i16:
+; ALL: # BB#0:
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: movw %ax, (%rdi)
+; ALL-NEXT: retq
%1 = fptrunc float %a0 to half
%2 = bitcast half %1 to i16
store i16 %2, i16* %a1
@@ -3542,83 +2590,24 @@ define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind {
}
define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind {
-; AVX1-LABEL: store_cvt_4f32_to_4i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %edx
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %esi
-; AVX1-NEXT: movw %si, (%rdi)
-; AVX1-NEXT: movw %dx, 6(%rdi)
-; AVX1-NEXT: movw %cx, 4(%rdi)
-; AVX1-NEXT: movw %ax, 2(%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: store_cvt_4f32_to_4i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %esi
-; AVX2-NEXT: movw %si, (%rdi)
-; AVX2-NEXT: movw %dx, 6(%rdi)
-; AVX2-NEXT: movw %cx, 4(%rdi)
-; AVX2-NEXT: movw %ax, 2(%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: store_cvt_4f32_to_4i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %edx
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %esi
-; AVX512F-NEXT: movw %si, (%rdi)
-; AVX512F-NEXT: movw %dx, 6(%rdi)
-; AVX512F-NEXT: movw %cx, 4(%rdi)
-; AVX512F-NEXT: movw %ax, 2(%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_4f32_to_4i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %edx
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %esi
-; AVX512VL-NEXT: movw %si, (%rdi)
-; AVX512VL-NEXT: movw %dx, 6(%rdi)
-; AVX512VL-NEXT: movw %cx, 4(%rdi)
-; AVX512VL-NEXT: movw %ax, 2(%rdi)
-; AVX512VL-NEXT: retq
+; ALL-LABEL: store_cvt_4f32_to_4i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %edx
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %esi
+; ALL-NEXT: movw %si, (%rdi)
+; ALL-NEXT: movw %dx, 6(%rdi)
+; ALL-NEXT: movw %cx, 4(%rdi)
+; ALL-NEXT: movw %ax, 2(%rdi)
+; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
store <4 x i16> %2, <4 x i16>* %a1
@@ -3680,30 +2669,28 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
;
; AVX512F-LABEL: store_cvt_4f32_to_8i16_undef:
; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: movzwl %cx, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
+; AVX512F-NEXT: movzwl %dx, %edx
; AVX512F-NEXT: orl %eax, %edx
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_4f32_to_8i16_undef:
@@ -3794,30 +2781,28 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
;
; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero:
; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: movzwl %cx, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
+; AVX512F-NEXT: movzwl %dx, %edx
; AVX512F-NEXT: orl %eax, %edx
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero:
@@ -3856,150 +2841,41 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
}
define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
-; AVX1-LABEL: store_cvt_8f32_to_8i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %r8d
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %r9d
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %r10d
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %xmm2, %r11d
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %xmm2, %eax
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %xmm2, %ecx
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %esi
-; AVX1-NEXT: movw %si, 8(%rdi)
-; AVX1-NEXT: movw %dx, (%rdi)
-; AVX1-NEXT: movw %cx, 14(%rdi)
-; AVX1-NEXT: movw %ax, 12(%rdi)
-; AVX1-NEXT: movw %r11w, 10(%rdi)
-; AVX1-NEXT: movw %r10w, 6(%rdi)
-; AVX1-NEXT: movw %r9w, 4(%rdi)
-; AVX1-NEXT: movw %r8w, 2(%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: store_cvt_8f32_to_8i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %r8d
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %r9d
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %r10d
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %xmm2, %r11d
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %xmm2, %eax
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %xmm2, %ecx
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %esi
-; AVX2-NEXT: movw %si, 8(%rdi)
-; AVX2-NEXT: movw %dx, (%rdi)
-; AVX2-NEXT: movw %cx, 14(%rdi)
-; AVX2-NEXT: movw %ax, 12(%rdi)
-; AVX2-NEXT: movw %r11w, 10(%rdi)
-; AVX2-NEXT: movw %r10w, 6(%rdi)
-; AVX2-NEXT: movw %r9w, 4(%rdi)
-; AVX2-NEXT: movw %r8w, 2(%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: store_cvt_8f32_to_8i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %r8d
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %r9d
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %r10d
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %r11d
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %ecx
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %esi
-; AVX512F-NEXT: movw %si, 8(%rdi)
-; AVX512F-NEXT: movw %dx, (%rdi)
-; AVX512F-NEXT: movw %cx, 14(%rdi)
-; AVX512F-NEXT: movw %ax, 12(%rdi)
-; AVX512F-NEXT: movw %r11w, 10(%rdi)
-; AVX512F-NEXT: movw %r10w, 6(%rdi)
-; AVX512F-NEXT: movw %r9w, 4(%rdi)
-; AVX512F-NEXT: movw %r8w, 2(%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_8f32_to_8i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %r8d
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %r9d
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %r10d
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %xmm2, %r11d
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %xmm2, %ecx
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %edx
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %esi
-; AVX512VL-NEXT: movw %si, 8(%rdi)
-; AVX512VL-NEXT: movw %dx, (%rdi)
-; AVX512VL-NEXT: movw %cx, 14(%rdi)
-; AVX512VL-NEXT: movw %ax, 12(%rdi)
-; AVX512VL-NEXT: movw %r11w, 10(%rdi)
-; AVX512VL-NEXT: movw %r10w, 6(%rdi)
-; AVX512VL-NEXT: movw %r9w, 4(%rdi)
-; AVX512VL-NEXT: movw %r8w, 2(%rdi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; ALL-LABEL: store_cvt_8f32_to_8i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %r8d
+; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %r9d
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %r10d
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; ALL-NEXT: vmovd %xmm2, %r11d
+; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; ALL-NEXT: vmovd %xmm2, %eax
+; ALL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; ALL-NEXT: vmovd %xmm2, %ecx
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; ALL-NEXT: vmovd %xmm0, %esi
+; ALL-NEXT: movw %si, 8(%rdi)
+; ALL-NEXT: movw %dx, (%rdi)
+; ALL-NEXT: movw %cx, 14(%rdi)
+; ALL-NEXT: movw %ax, 12(%rdi)
+; ALL-NEXT: movw %r11w, 10(%rdi)
+; ALL-NEXT: movw %r10w, 6(%rdi)
+; ALL-NEXT: movw %r9w, 4(%rdi)
+; ALL-NEXT: movw %r8w, 2(%rdi)
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%1 = fptrunc <8 x float> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
store <8 x i16> %2, <8 x i16>* %a1
@@ -4141,141 +3017,73 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: store_cvt_16f32_to_16i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm4
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm4
-; AVX512F-NEXT: movw %ax, 24(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm4
-; AVX512F-NEXT: movw %ax, 16(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm4
-; AVX512F-NEXT: movw %ax, 8(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
-; AVX512F-NEXT: movw %ax, (%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
-; AVX512F-NEXT: movw %ax, 30(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: movw %ax, 28(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: movw %ax, 26(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: movw %ax, 22(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: movw %ax, 20(%rdi)
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: movw %ax, 18(%rdi)
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: movw %ax, 14(%rdi)
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movw %ax, 12(%rdi)
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: movw %ax, 10(%rdi)
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: movw %ax, 6(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: movw %ax, 4(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: movw %ax, 2(%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_16f32_to_16i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm4
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm4
-; AVX512VL-NEXT: movw %ax, 24(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm4
-; AVX512VL-NEXT: movw %ax, 16(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm4
-; AVX512VL-NEXT: movw %ax, 8(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX512VL-NEXT: movw %ax, (%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX512VL-NEXT: movw %ax, 30(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: movw %ax, 28(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: movw %ax, 26(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: movw %ax, 22(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: movw %ax, 20(%rdi)
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: movw %ax, 18(%rdi)
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: movw %ax, 14(%rdi)
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: movw %ax, 12(%rdi)
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: movw %ax, 10(%rdi)
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: movw %ax, 6(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: movw %ax, 4(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: movw %ax, 2(%rdi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: store_cvt_16f32_to_16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm4
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm4
+; AVX512-NEXT: movw %ax, 24(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm4
+; AVX512-NEXT: movw %ax, 16(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm4
+; AVX512-NEXT: movw %ax, 8(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: movw %ax, (%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: movw %ax, 30(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 28(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 26(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 22(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: movw %ax, 20(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: movw %ax, 18(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: movw %ax, 14(%rdi)
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: movw %ax, 12(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: movw %ax, 10(%rdi)
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: movw %ax, 6(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: movw %ax, 4(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: movw %ax, 2(%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = fptrunc <16 x float> %a0 to <16 x half>
%2 = bitcast <16 x half> %1 to <16 x i16>
store <16 x i16> %2, <16 x i16>* %a1
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index cd4b237735f..25377f26799 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -3333,11 +3333,17 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; AVX1-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: popq %r12
+; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: popq %r13
+; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: popq %r14
+; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: popq %r15
+; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_16i1_to_16i16:
@@ -3424,11 +3430,17 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; AVX2-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: popq %r12
+; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: popq %r13
+; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: popq %r14
+; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: popq %r15
+; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: .cfi_def_cfa_offset 8
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_sext_16i1_to_16i16:
@@ -4824,6 +4836,7 @@ define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; X32-SSE41-NEXT: movd %xmm0, %eax
; X32-SSE41-NEXT: popl %ecx
+; X32-SSE41-NEXT: .cfi_def_cfa_offset 4
; X32-SSE41-NEXT: retl
entry:
%Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index dd329d21dc9..7ef5bee5420 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -3963,10 +3963,20 @@ define <16 x i16> @concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31(<16 x i
}
define <16 x i16> @concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc(<16 x i16> %a, <16 x i16> %b) {
-; ALL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: retq
%ahi = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%bc0hi = bitcast <8 x i16> %ahi to <16 x i8>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index cf1aaca4ee2..56567c7e794 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1053,8 +1053,8 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_3254:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
ret <4 x i64> %shuffle
@@ -1075,8 +1075,8 @@ define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_3276:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6>
ret <4 x i64> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index b95e7cf008a..e4234c05845 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1789,21 +1789,33 @@ define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
}
define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) {
-; ALL-LABEL: shuffle_v8i32_7654fedc:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i32_7654fedc:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i32_7654fedc:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) {
-; ALL-LABEL: shuffle_v8i32_fedc7654:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i32_fedc7654:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i32_fedc7654:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
}
@@ -2177,10 +2189,15 @@ define <8 x i32> @concat_v8i32_0123CDEF(<8 x i32> %a, <8 x i32> %b) {
}
define <8 x i32> @concat_v8i32_4567CDEF_bc(<8 x i32> %a0, <8 x i32> %a1) {
-; ALL-LABEL: concat_v8i32_4567CDEF_bc:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: concat_v8i32_4567CDEF_bc:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: concat_v8i32_4567CDEF_bc:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: retq
%a0hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%a1hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%bc0hi = bitcast <4 x i32> %a0hi to <2 x i64>
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 6c980559721..1d17ef109d2 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1165,14 +1165,31 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_01014545:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_01014545:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-32-NEXT: retl
+
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01014545_mem(<8 x i64>* %ptr, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8i64_01014545_mem:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_01014545_mem:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5]
; AVX512F-32-NEXT: retl
+ %a = load <8 x i64>, <8 x i64>* %ptr
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x i64> %shuffle
}
diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll
index efbe5586747..b107b60cd6d 100644
--- a/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -619,6 +619,7 @@ define <64 x i8> @test_mm512_mask_blend_epi8(<64 x i8> %A, <64 x i8> %W){
; KNL32-NEXT: vpblendvb %ymm3, 8(%ebp), %ymm1, %ymm1
; KNL32-NEXT: movl %ebp, %esp
; KNL32-NEXT: popl %ebp
+; KNL32-NEXT: .cfi_def_cfa %esp, 4
; KNL32-NEXT: retl
entry:
%0 = shufflevector <64 x i8> %A, <64 x i8> %W, <64 x i32> <i32 64, i32 1, i32 66, i32 3, i32 68, i32 5, i32 70, i32 7, i32 72, i32 9, i32 74, i32 11, i32 76, i32 13, i32 78, i32 15, i32 80, i32 17, i32 82, i32 19, i32 84, i32 21, i32 86, i32 23, i32 88, i32 25, i32 90, i32 27, i32 92, i32 29, i32 94, i32 31, i32 96, i32 33, i32 98, i32 35, i32 100, i32 37, i32 102, i32 39, i32 104, i32 41, i32 106, i32 43, i32 108, i32 45, i32 110, i32 47, i32 112, i32 49, i32 114, i32 51, i32 116, i32 53, i32 118, i32 55, i32 120, i32 57, i32 122, i32 59, i32 124, i32 61, i32 126, i32 63>
@@ -659,6 +660,7 @@ define <32 x i16> @test_mm512_mask_blend_epi16(<32 x i16> %A, <32 x i16> %W){
; KNL32-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15]
; KNL32-NEXT: movl %ebp, %esp
; KNL32-NEXT: popl %ebp
+; KNL32-NEXT: .cfi_def_cfa %esp, 4
; KNL32-NEXT: retl
entry:
%0 = shufflevector <32 x i16> %A, <32 x i16> %W, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll
index 8d057290085..0e690347a54 100644
--- a/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -630,6 +630,7 @@ define i64 @shuf64i1_zero(i64 %a) {
; AVX512F-NEXT: orq %rcx, %rax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -662,6 +663,7 @@ define i64 @shuf64i1_zero(i64 %a) {
; AVX512VL-NEXT: orq %rcx, %rax
; AVX512VL-NEXT: movq %rbp, %rsp
; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: .cfi_def_cfa %rsp, 8
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index dc08d88074d..ac1083ad447 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -813,13 +813,10 @@ define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
;
; AVX2-LABEL: trunc16i32_16i16_lshr:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -947,28 +944,52 @@ entry:
}
define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
-; SSE-LABEL: trunc16i32_16i8_lshr:
-; SSE: # BB#0: # %entry
-; SSE-NEXT: psrld $24, %xmm1
-; SSE-NEXT: psrld $24, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: psrld $24, %xmm3
-; SSE-NEXT: psrld $24, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: movdqu %xmm0, (%rax)
-; SSE-NEXT: retq
+; SSE2-LABEL: trunc16i32_16i8_lshr:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: psrld $24, %xmm1
+; SSE2-NEXT: psrld $24, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: psrld $24, %xmm3
+; SSE2-NEXT: psrld $24, %xmm2
+; SSE2-NEXT: packuswb %xmm3, %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc16i32_16i8_lshr:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: psrld $24, %xmm1
+; SSSE3-NEXT: psrld $24, %xmm0
+; SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSSE3-NEXT: psrld $24, %xmm3
+; SSSE3-NEXT: psrld $24, %xmm2
+; SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i32_16i8_lshr:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: psrld $24, %xmm1
+; SSE41-NEXT: psrld $24, %xmm0
+; SSE41-NEXT: packssdw %xmm1, %xmm0
+; SSE41-NEXT: psrld $24, %xmm3
+; SSE41-NEXT: psrld $24, %xmm2
+; SSE41-NEXT: packssdw %xmm3, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i8_lshr:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
@@ -976,16 +997,12 @@ define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
;
; AVX2-LABEL: trunc16i32_16i8_lshr:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/wide-integer-cmp.ll b/test/CodeGen/X86/wide-integer-cmp.ll
index 97460b36a74..9bd53c6fbd3 100644
--- a/test/CodeGen/X86/wide-integer-cmp.ll
+++ b/test/CodeGen/X86/wide-integer-cmp.ll
@@ -105,10 +105,13 @@ define i32 @test_wide(i128 %a, i128 %b) {
; CHECK-NEXT: # BB#1: # %bb1
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB4_2: # %bb2
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: movl $2, %eax
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
entry:
%cmp = icmp slt i128 %a, %b
diff --git a/test/CodeGen/X86/x86-framelowering-trap.ll b/test/CodeGen/X86/x86-framelowering-trap.ll
index f1590abcae8..89f4528fb06 100644
--- a/test/CodeGen/X86/x86-framelowering-trap.ll
+++ b/test/CodeGen/X86/x86-framelowering-trap.ll
@@ -6,6 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: pushq
; CHECK: ud2
; CHECK-NEXT: popq
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
define void @bar() {
entry:
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index acad9f771fc..bc6a6ea205c 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1816,6 +1816,7 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
; AVX1-NEXT: vmovaps %ymm9, 64(%rdi)
; AVX1-NEXT: vmovaps %ymm8, (%rdi)
; AVX1-NEXT: addq $24, %rsp
+; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
diff --git a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
index 763d764698d..929dafbfc21 100644
--- a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
+++ b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
@@ -20,6 +20,7 @@ define x86_64_sysvcc i32 @bar(i32 %a0, i32 %a1, float %b0) #0 {
; CHECK-NEXT: movl $4, %eax
; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: popq %rdx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
call void asm sideeffect "", "~{rax},~{rdx},~{xmm1},~{rdi},~{rsi},~{xmm0}"()
ret i32 4
diff --git a/test/DebugInfo/AArch64/inlined-argument.ll b/test/DebugInfo/AArch64/inlined-argument.ll
new file mode 100644
index 00000000000..868efc28f6a
--- /dev/null
+++ b/test/DebugInfo/AArch64/inlined-argument.ll
@@ -0,0 +1,140 @@
+; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump --name resource - | FileCheck %s
+; CHECK: DW_TAG_formal_parameter
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NEXT: DW_AT_location (DW_OP_reg1 W1)
+; CHECK-NEXT: DW_AT_abstract_origin {{.*}}"resource"
+;
+; Generated from:
+; typedef struct t *t_t;
+; extern unsigned int enable;
+; struct t {
+; struct q {
+; struct q *next;
+; unsigned long long resource;
+; } * s;
+; } * tt;
+; static unsigned long find(t_t t, unsigned long long resource) {
+; struct q *q;
+; q = t->s;
+; while (q) {
+; if (q->resource == resource)
+; return q;
+; q = q->next;
+; }
+; }
+; int g(t_t t, unsigned long long r) {
+; struct q *q;
+; q = find(t, r);
+; if (!q)
+; if (__builtin_expect(enable, 0)) { }
+; }
+
+
+source_filename = "test.i"
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios5.0.0"
+
+%struct.t = type { %struct.q* }
+%struct.q = type { %struct.q*, i64 }
+
+@tt = local_unnamed_addr global %struct.t* null, align 8, !dbg !0
+
+; Function Attrs: noredzone nounwind readonly ssp
+define i32 @g(%struct.t* nocapture readonly %t, i64 %r) local_unnamed_addr #0 !dbg !20 {
+entry:
+ tail call void @llvm.dbg.value(metadata %struct.t* %t, metadata !26, metadata !DIExpression()), !dbg !29
+ tail call void @llvm.dbg.value(metadata i64 %r, metadata !27, metadata !DIExpression()), !dbg !30
+ tail call void @llvm.dbg.value(metadata %struct.t* %t, metadata !31, metadata !DIExpression()), !dbg !39
+ tail call void @llvm.dbg.value(metadata i64 %r, metadata !37, metadata !DIExpression()), !dbg !41
+ %s.i5 = bitcast %struct.t* %t to %struct.q**
+ tail call void @llvm.dbg.value(metadata %struct.q** %s.i5, metadata !38, metadata !DIExpression(DW_OP_deref)), !dbg !42
+ %q.06.i = load %struct.q*, %struct.q** %s.i5, align 8
+ tail call void @llvm.dbg.value(metadata %struct.q* %q.06.i, metadata !38, metadata !DIExpression()), !dbg !42
+ %tobool7.i = icmp eq %struct.q* %q.06.i, null, !dbg !43
+ br i1 %tobool7.i, label %find.exit, label %while.body.i.preheader, !dbg !43
+
+while.body.i.preheader: ; preds = %entry
+ br label %while.body.i, !dbg !44
+
+while.body.i: ; preds = %while.body.i.preheader, %if.end.i
+ %q.08.i = phi %struct.q* [ %q.0.i, %if.end.i ], [ %q.06.i, %while.body.i.preheader ]
+ %resource1.i = getelementptr inbounds %struct.q, %struct.q* %q.08.i, i64 0, i32 1, !dbg !44
+ %0 = load i64, i64* %resource1.i, align 8, !dbg !44
+ %cmp.i = icmp eq i64 %0, %r, !dbg !47
+ br i1 %cmp.i, label %find.exit, label %if.end.i, !dbg !48
+
+if.end.i: ; preds = %while.body.i
+ %next.i6 = bitcast %struct.q* %q.08.i to %struct.q**
+ tail call void @llvm.dbg.value(metadata %struct.q** %next.i6, metadata !38, metadata !DIExpression(DW_OP_deref)), !dbg !42
+ %q.0.i = load %struct.q*, %struct.q** %next.i6, align 8
+ tail call void @llvm.dbg.value(metadata %struct.q* %q.0.i, metadata !38, metadata !DIExpression()), !dbg !42
+ %tobool.i = icmp eq %struct.q* %q.0.i, null, !dbg !43
+ br i1 %tobool.i, label %find.exit, label %while.body.i, !dbg !43, !llvm.loop !49
+
+find.exit: ; preds = %while.body.i, %if.end.i, %entry
+ ret i32 undef, !dbg !52
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { noredzone nounwind readonly ssp }
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!16, !17, !18}
+!llvm.ident = !{!19}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "tt", scope: !2, file: !3, line: 8, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 6.0.0 (trunk 317516) (llvm/trunk 317518)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "test.i", directory: "/")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64)
+!7 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t", file: !3, line: 3, size: 64, elements: !8)
+!8 = !{!9}
+!9 = !DIDerivedType(tag: DW_TAG_member, name: "s", scope: !7, file: !3, line: 7, baseType: !10, size: 64)
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64)
+!11 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "q", file: !3, line: 4, size: 128, elements: !12)
+!12 = !{!13, !14}
+!13 = !DIDerivedType(tag: DW_TAG_member, name: "next", scope: !11, file: !3, line: 5, baseType: !10, size: 64)
+!14 = !DIDerivedType(tag: DW_TAG_member, name: "resource", scope: !11, file: !3, line: 6, baseType: !15, size: 64, offset: 64)
+!15 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!16 = !{i32 2, !"Dwarf Version", i32 2}
+!17 = !{i32 2, !"Debug Info Version", i32 3}
+!18 = !{i32 1, !"wchar_size", i32 4}
+!19 = !{!"clang version 6.0.0 (trunk 317516) (llvm/trunk 317518)"}
+!20 = distinct !DISubprogram(name: "g", scope: !3, file: !3, line: 18, type: !21, isLocal: false, isDefinition: true, scopeLine: 18, flags: DIFlagPrototyped, isOptimized: true, unit: !2, variables: !25)
+!21 = !DISubroutineType(types: !22)
+!22 = !{!23, !24, !15}
+!23 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!24 = !DIDerivedType(tag: DW_TAG_typedef, name: "t_t", file: !3, line: 1, baseType: !6)
+!25 = !{!26, !27, !28}
+!26 = !DILocalVariable(name: "t", arg: 1, scope: !20, file: !3, line: 18, type: !24)
+!27 = !DILocalVariable(name: "r", arg: 2, scope: !20, file: !3, line: 18, type: !15)
+!28 = !DILocalVariable(name: "q", scope: !20, file: !3, line: 19, type: !10)
+!29 = !DILocation(line: 18, column: 11, scope: !20)
+!30 = !DILocation(line: 18, column: 33, scope: !20)
+!31 = !DILocalVariable(name: "t", arg: 1, scope: !32, file: !3, line: 9, type: !24)
+!32 = distinct !DISubprogram(name: "find", scope: !3, file: !3, line: 9, type: !33, isLocal: true, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, unit: !2, variables: !36)
+!33 = !DISubroutineType(types: !34)
+!34 = !{!35, !24, !15}
+!35 = !DIBasicType(name: "long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!36 = !{!31, !37, !38}
+!37 = !DILocalVariable(name: "resource", arg: 2, scope: !32, file: !3, line: 9, type: !15)
+!38 = !DILocalVariable(name: "q", scope: !32, file: !3, line: 10, type: !10)
+!39 = !DILocation(line: 9, column: 31, scope: !32, inlinedAt: !40)
+!40 = distinct !DILocation(line: 20, column: 7, scope: !20)
+!41 = !DILocation(line: 9, column: 53, scope: !32, inlinedAt: !40)
+!42 = !DILocation(line: 10, column: 13, scope: !32, inlinedAt: !40)
+!43 = !DILocation(line: 12, column: 3, scope: !32, inlinedAt: !40)
+!44 = !DILocation(line: 13, column: 12, scope: !45, inlinedAt: !40)
+!45 = distinct !DILexicalBlock(scope: !46, file: !3, line: 13, column: 9)
+!46 = distinct !DILexicalBlock(scope: !32, file: !3, line: 12, column: 13)
+!47 = !DILocation(line: 13, column: 21, scope: !45, inlinedAt: !40)
+!48 = !DILocation(line: 13, column: 9, scope: !46, inlinedAt: !40)
+!49 = distinct !{!49, !50, !51}
+!50 = !DILocation(line: 12, column: 3, scope: !32)
+!51 = !DILocation(line: 16, column: 3, scope: !32)
+!52 = !DILocation(line: 24, column: 1, scope: !20)
diff --git a/test/DebugInfo/ARM/illegal-fragment.ll b/test/DebugInfo/ARM/illegal-fragment.ll
new file mode 100644
index 00000000000..41e28faa708
--- /dev/null
+++ b/test/DebugInfo/ARM/illegal-fragment.ll
@@ -0,0 +1,95 @@
+; RUN: llc -filetype=obj %s -o - | llvm-dwarfdump - | FileCheck %s
+; CHECK: file format Mach-O arm
+; ModuleID = 'test.ll'
+source_filename = "test.i"
+target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7s-apple-ios5.0.0"
+
+%struct.vm_object = type { i64 }
+
+; Function Attrs: nounwind ssp
+define void @f(%struct.vm_object* %object, i64* nocapture readonly %start) local_unnamed_addr #0 !dbg !11 {
+entry:
+ tail call void @llvm.dbg.value(metadata %struct.vm_object* %object, metadata !21, metadata !DIExpression()), !dbg !27
+ tail call void @llvm.dbg.value(metadata i64* %start, metadata !22, metadata !DIExpression()), !dbg !28
+ tail call void @llvm.dbg.value(metadata i64 %0, metadata !25, metadata !DIExpression()), !dbg !29
+ tail call void @llvm.dbg.value(metadata i64 %0, metadata !26, metadata !DIExpression(DW_OP_constu, 4096, DW_OP_minus, DW_OP_stack_value)), !dbg !30
+ ; This debug value cannot safely be split into two 32-bit pieces.
+ ; CHECK-NOT: DW_AT_name(offset)
+ tail call void @llvm.dbg.value(metadata i32 undef, metadata !23, metadata !DIExpression()), !dbg !31
+ br i1 undef, label %for.end, label %for.body.lr.ph, !dbg !31
+
+for.body.lr.ph: ; preds = %entry
+ %0 = load i64, i64* %start, align 4, !dbg !33
+ br label %for.body, !dbg !31
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %offset.010.in = phi i64 [ %0, %for.body.lr.ph ], [ %offset.010, %for.body ]
+ %head_size.09 = phi i32 [ undef, %for.body.lr.ph ], [ %sub2, %for.body ]
+ %offset.010 = add i64 %offset.010.in, -4096
+ tail call void @llvm.dbg.value(metadata i32 %head_size.09, metadata !23, metadata !DIExpression()), !dbg !30
+ %call = tail call i32 bitcast (i32 (...)* @use to i32 (i64, %struct.vm_object*)*)(i64 %offset.010, %struct.vm_object* %object) #2, !dbg !34
+ %sub2 = add i32 %head_size.09, -4096, !dbg !37
+ tail call void @llvm.dbg.value(metadata i64 %offset.010, metadata !26, metadata !DIExpression(DW_OP_constu, 4096, DW_OP_minus, DW_OP_stack_value)), !dbg !29
+ tail call void @llvm.dbg.value(metadata i32 %sub2, metadata !23, metadata !DIExpression()), !dbg !30
+ %tobool = icmp eq i32 %sub2, 0, !dbg !31
+ br i1 %tobool, label %for.end, label %for.body, !dbg !31, !llvm.loop !38
+
+for.end: ; preds = %for.body, %entry
+ ret void, !dbg !40
+}
+
+declare i32 @use(...) local_unnamed_addr
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind ssp }
+attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { nobuiltin nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!5, !6, !7, !8, !9}
+!llvm.ident = !{!10}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0 (trunk 317434) (llvm/trunk 317437)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3)
+!1 = !DIFile(filename: "test.i", directory: "/Data/radar/31209283")
+!2 = !{}
+!3 = !{!4}
+!4 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!5 = !{i32 2, !"Dwarf Version", i32 2}
+!6 = !{i32 2, !"Debug Info Version", i32 3}
+!7 = !{i32 1, !"wchar_size", i32 4}
+!8 = !{i32 1, !"min_enum_size", i32 4}
+!9 = !{i32 7, !"PIC Level", i32 2}
+!10 = !{!"clang version 6.0.0 (trunk 317434) (llvm/trunk 317437)"}
+!11 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 6, type: !12, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !20)
+!12 = !DISubroutineType(types: !13)
+!13 = !{null, !14, !19}
+!14 = !DIDerivedType(tag: DW_TAG_typedef, name: "v_t", file: !1, line: 1, baseType: !15)
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 32)
+!16 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "v", file: !1, line: 2, size: 64, elements: !17)
+!17 = !{!18}
+!18 = !DIDerivedType(tag: DW_TAG_member, name: "p", scope: !16, file: !1, line: 3, baseType: !4, size: 64)
+!19 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 32)
+!20 = !{!21, !22, !23, !25, !26}
+!21 = !DILocalVariable(name: "object", arg: 1, scope: !11, file: !1, line: 6, type: !14)
+!22 = !DILocalVariable(name: "start", arg: 2, scope: !11, file: !1, line: 6, type: !19)
+!23 = !DILocalVariable(name: "head_size", scope: !11, file: !1, line: 7, type: !24)
+!24 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!25 = !DILocalVariable(name: "orig_start", scope: !11, file: !1, line: 8, type: !4)
+!26 = !DILocalVariable(name: "offset", scope: !11, file: !1, line: 9, type: !4)
+!27 = !DILocation(line: 6, column: 20, scope: !11)
+!28 = !DILocation(line: 6, column: 48, scope: !11)
+!29 = !DILocation(line: 7, column: 12, scope: !11)
+!30 = !DILocation(line: 10, column: 16, scope: !11)
+!31 = !DILocation(line: 11, column: 5, scope: !32)
+!32 = distinct !DILexicalBlock(scope: !11, file: !1, line: 11, column: 5)
+!33 = !DILocation(line: 8, column: 22, scope: !11)
+!34 = !DILocation(line: 13, column: 7, scope: !35)
+!35 = distinct !DILexicalBlock(scope: !36, file: !1, line: 12, column: 75)
+!36 = distinct !DILexicalBlock(scope: !32, file: !1, line: 11, column: 5)
+!37 = !DILocation(line: 12, column: 61, scope: !36)
+!38 = distinct !{!38, !31, !39}
+!39 = !DILocation(line: 14, column: 3, scope: !32)
+!40 = !DILocation(line: 15, column: 1, scope: !11)
diff --git a/test/DebugInfo/ARM/salvage-debug-info.ll b/test/DebugInfo/ARM/salvage-debug-info.ll
new file mode 100644
index 00000000000..5509b92a5c1
--- /dev/null
+++ b/test/DebugInfo/ARM/salvage-debug-info.ll
@@ -0,0 +1,118 @@
+; RUN: opt -codegenprepare -S %s -o - | FileCheck %s
+; typedef struct info {
+; unsigned long long size;
+; } info_t;
+; extern unsigned p;
+; extern unsigned n;
+; void f() {
+; unsigned int i;
+; if (p) {
+; info_t *info = (info_t *)p;
+; for (i = 0; i < n; i++)
+; use(info[i].size);
+; }
+; }
+source_filename = "debug.i"
+target datalayout = "e-m:o-p:32:32-i64:64-a:0:32-n32-S128"
+target triple = "thumbv7k-apple-ios10.0.0"
+
+%struct.info = type { i64 }
+
+@p = external local_unnamed_addr global i32, align 4
+@n = external local_unnamed_addr global i32, align 4
+
+; Function Attrs: nounwind ssp uwtable
+define void @f() local_unnamed_addr #0 !dbg !16 {
+entry:
+ %0 = load i32, i32* @p, align 4, !dbg !25
+ %tobool = icmp eq i32 %0, 0, !dbg !25
+ br i1 %tobool, label %if.end, label %if.then, !dbg !26
+
+if.then: ; preds = %entry
+ %1 = inttoptr i32 %0 to %struct.info*, !dbg !27
+ tail call void @llvm.dbg.value(metadata %struct.info* %1, metadata !22, metadata !DIExpression()), !dbg !28
+ ; CHECK: call void @llvm.dbg.value(metadata i32 %0, metadata !22, metadata !DIExpression())
+ tail call void @llvm.dbg.value(metadata i32 0, metadata !20, metadata !DIExpression()), !dbg !29
+ %2 = load i32, i32* @n, align 4, !dbg !30
+ %cmp5 = icmp eq i32 %2, 0, !dbg !33
+ br i1 %cmp5, label %if.end, label %for.body.preheader, !dbg !34
+
+for.body.preheader: ; preds = %if.then
+ ; CHECK: for.body.preheader:
+ ; CHECK: %2 = inttoptr i32 %0 to %struct.info*
+ br label %for.body, !dbg !35
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %lsr.iv = phi %struct.info* [ %1, %for.body.preheader ], [ %scevgep, %for.body ]
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %lsr.iv7 = bitcast %struct.info* %lsr.iv to i64*
+ tail call void @llvm.dbg.value(metadata i32 %i.06, metadata !20, metadata !DIExpression()), !dbg !29
+ %3 = load i64, i64* %lsr.iv7, align 8, !dbg !35
+ %call = tail call i32 bitcast (i32 (...)* @use to i32 (i64)*)(i64 %3) #3, !dbg !36
+ %inc = add nuw i32 %i.06, 1, !dbg !37
+ tail call void @llvm.dbg.value(metadata i32 %inc, metadata !20, metadata !DIExpression()), !dbg !29
+ %4 = load i32, i32* @n, align 4, !dbg !30
+ %scevgep = getelementptr %struct.info, %struct.info* %lsr.iv, i32 1, !dbg !33
+ %cmp = icmp ult i32 %inc, %4, !dbg !33
+ br i1 %cmp, label %for.body, label %if.end.loopexit, !dbg !34, !llvm.loop !38
+
+if.end.loopexit: ; preds = %for.body
+ br label %if.end, !dbg !40
+
+if.end: ; preds = %if.end.loopexit, %if.then, %entry
+ ret void, !dbg !40
+}
+declare i32 @use(...) local_unnamed_addr #1
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #2
+
+attributes #0 = { nounwind ssp uwtable }
+attributes #2 = { nounwind readnone speculatable }
+attributes #3 = { nobuiltin nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!10, !11, !12, !13, !14}
+!llvm.ident = !{!15}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0 (trunk 317231) (llvm/trunk 317262)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3)
+!1 = !DIFile(filename: "debug.i", directory: "/Data/radar/35321562")
+!2 = !{}
+!3 = !{!4}
+!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 32)
+!5 = !DIDerivedType(tag: DW_TAG_typedef, name: "info_t", file: !1, line: 3, baseType: !6)
+!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "info", file: !1, line: 1, size: 64, elements: !7)
+!7 = !{!8}
+!8 = !DIDerivedType(tag: DW_TAG_member, name: "size", scope: !6, file: !1, line: 2, baseType: !9, size: 64)
+!9 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!10 = !{i32 2, !"Dwarf Version", i32 4}
+!11 = !{i32 2, !"Debug Info Version", i32 3}
+!12 = !{i32 1, !"wchar_size", i32 4}
+!13 = !{i32 1, !"min_enum_size", i32 4}
+!14 = !{i32 7, !"PIC Level", i32 2}
+!15 = !{!"clang version 6.0.0 (trunk 317231) (llvm/trunk 317262)"}
+!16 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 6, type: !17, isLocal: false, isDefinition: true, scopeLine: 6, isOptimized: true, unit: !0, variables: !19)
+!17 = !DISubroutineType(types: !18)
+!18 = !{null}
+!19 = !{!20, !22}
+!20 = !DILocalVariable(name: "i", scope: !16, file: !1, line: 7, type: !21)
+!21 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!22 = !DILocalVariable(name: "info", scope: !23, file: !1, line: 9, type: !4)
+!23 = distinct !DILexicalBlock(scope: !24, file: !1, line: 8, column: 10)
+!24 = distinct !DILexicalBlock(scope: !16, file: !1, line: 8, column: 7)
+!25 = !DILocation(line: 8, column: 7, scope: !24)
+!26 = !DILocation(line: 8, column: 7, scope: !16)
+!27 = !DILocation(line: 9, column: 20, scope: !23)
+!28 = !DILocation(line: 9, column: 13, scope: !23)
+!29 = !DILocation(line: 7, column: 16, scope: !16)
+!30 = !DILocation(line: 10, column: 21, scope: !31)
+!31 = distinct !DILexicalBlock(scope: !32, file: !1, line: 10, column: 5)
+!32 = distinct !DILexicalBlock(scope: !23, file: !1, line: 10, column: 5)
+!33 = !DILocation(line: 10, column: 19, scope: !31)
+!34 = !DILocation(line: 10, column: 5, scope: !32)
+!35 = !DILocation(line: 11, column: 19, scope: !31)
+!36 = !DILocation(line: 11, column: 7, scope: !31)
+!37 = !DILocation(line: 10, column: 25, scope: !31)
+!38 = distinct !{!38, !34, !39}
+!39 = !DILocation(line: 11, column: 23, scope: !32)
+!40 = !DILocation(line: 13, column: 1, scope: !16)
diff --git a/test/DebugInfo/Generic/location-verifier.ll b/test/DebugInfo/Generic/location-verifier.ll
index b1e0805428c..3c6bb425a66 100644
--- a/test/DebugInfo/Generic/location-verifier.ll
+++ b/test/DebugInfo/Generic/location-verifier.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as -disable-output -verify-debug-info -o - < %s 2>&1 | FileCheck %s
+; RUN: llvm-as -disable-output -o - < %s 2>&1 | FileCheck %s
; ModuleID = 'test.c'
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"
diff --git a/test/DebugInfo/Generic/missing-abstract-variable.ll b/test/DebugInfo/Generic/missing-abstract-variable.ll
index 16dcdebd1f1..8d5aff4084d 100644
--- a/test/DebugInfo/Generic/missing-abstract-variable.ll
+++ b/test/DebugInfo/Generic/missing-abstract-variable.ll
@@ -2,11 +2,6 @@
; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -v -debug-info - | FileCheck %s
-; The formal parameter 'b' for Function 'x' when inlined within 'a' is lost on
-; mips and powerpc64 (and on x86_64 at at least -O2). Presumably this is a
-; SelectionDAG issue (do mips/powerpc64 use FastISel?).
-; XFAIL: mips, powerpc64, s390x, sparc
-
; Build from the following source with clang -O2.
; The important details are that 'x's abstract definition is first built during
diff --git a/test/DebugInfo/Inputs/dwarfdump-header.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-header.elf-x86-64
deleted file mode 100644
index 21c1eacd071..00000000000
--- a/test/DebugInfo/Inputs/dwarfdump-header.elf-x86-64
+++ /dev/null
Binary files differ
diff --git a/test/DebugInfo/X86/dwarfdump-header-64.s b/test/DebugInfo/X86/dwarfdump-header-64.s
new file mode 100644
index 00000000000..f0baa592d8d
--- /dev/null
+++ b/test/DebugInfo/X86/dwarfdump-header-64.s
@@ -0,0 +1,149 @@
+# Test object to verify dwarfdump handles a DWARF-64 v5 line header.
+# FIXME: Make the other headers DWARF-64 also.
+# FIXME: Add variants for earlier DWARF versions.
+
+# Lines beginning with @ELF@ should be preserved for ELF targets;
+# lines beginning with @MACHO@ should be preserved for Mach-O targets.
+
+# RUN: sed -e 's/@ELF@//;s/@MACHO@.*//' %s | \
+# RUN: llvm-mc -triple x86_64-unknown-linux -filetype=obj -o - | \
+# RUN: llvm-dwarfdump -v - | FileCheck %s
+
+# RUN: sed -e 's/@ELF@.*//;s/@MACHO@//' %s | \
+# RUN: llvm-mc -triple x86_64-apple-darwin -filetype=obj -o - | \
+# RUN: llvm-dwarfdump -v - | FileCheck %s
+
+
+@ELF@ .section .debug_str,"MS",@progbits,1
+@MACHO@ .section __DWARF,__debug_str,regular,debug
+str_producer:
+ .asciz "Handmade DWARF producer"
+str_CU_5:
+ .asciz "V5_compile_unit"
+str_LT_5a:
+ .asciz "Directory5a"
+str_LT_5b:
+ .asciz "Directory5b"
+
+@ELF@ .section .debug_abbrev,"",@progbits
+@MACHO@ .section __DWARF,__debug_abbrev,regular,debug
+abbrev:
+ .byte 0x01 # Abbrev code
+ .byte 0x11 # DW_TAG_compile_unit
+ .byte 0x00 # DW_CHILDREN_no
+ .byte 0x25 # DW_AT_producer
+ .byte 0x0e # DW_FORM_strp
+ .byte 0x03 # DW_AT_name
+ .byte 0x0e # DW_FORM_strp
+ .byte 0x10 # DW_AT_stmt_list
+ .byte 0x17 # DW_FORM_sec_offset
+ .byte 0x00 # EOM(1)
+ .byte 0x00 # EOM(2)
+
+@ELF@ .section .debug_info,"",@progbits
+@MACHO@ .section __DWARF,__debug_info,regular,debug
+
+# DWARF-32 v5 normal CU header.
+Lset0 = CU_5_end-CU_5_version # Length of Unit
+ .long Lset0
+CU_5_version:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+@ELF@ .long abbrev # Offset Into Abbrev. Section
+@MACHO@ .long 0
+# The compile-unit DIE, with DW_AT_producer, DW_AT_name, DW_AT_stmt_list.
+ .byte 1
+ .long str_producer
+ .long str_CU_5
+@ELF@ .long LH_5_start
+@MACHO@ .long 0
+ .byte 0 # NULL
+CU_5_end:
+
+# CHECK-LABEL: .debug_info contents:
+# CHECK: 0x00000000: Compile Unit: length = 0x00000016 version = 0x0005 unit_type = DW_UT_compile abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x0000001a)
+# CHECK: 0x0000000c: DW_TAG_compile_unit
+# CHECK-NEXT: DW_AT_producer {{.*}} "Handmade DWARF producer"
+# CHECK-NEXT: DW_AT_name {{.*}} "V5_compile_unit"
+# CHECK-NEXT: DW_AT_stmt_list {{.*}} (0x00000000)
+
+@ELF@ .section .debug_line,"",@progbits
+@MACHO@ .section __DWARF,__debug_line,regular,debug
+
+# DWARF-64 v5 line-table header.
+LH_5_start:
+ .long -1
+Lset1 = LH_5_end-LH_5_version # Length of Unit
+ .quad Lset1
+LH_5_version:
+ .short 5 # DWARF version number
+ .byte 8 # Address Size
+ .byte 0 # Segment Selector Size
+Lset2 = LH_5_header_end-LH_5_params # Length of Prologue
+ .quad Lset2
+LH_5_params:
+ .byte 1 # Minimum Instruction Length
+ .byte 1 # Maximum Operations per Instruction
+ .byte 1 # Default is_stmt
+ .byte -5 # Line Base
+ .byte 14 # Line Range
+ .byte 13 # Opcode Base
+ .byte 0 # Standard Opcode Lengths
+ .byte 1
+ .byte 1
+ .byte 1
+ .byte 1
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 1
+ .byte 0
+ .byte 0
+ .byte 1
+ # Directory table format
+ .byte 1 # One element per directory entry
+ .byte 1 # DW_LNCT_path
+ .byte 0x0e # DW_FORM_strp (-> .debug_str)
+ # Directory table entries
+ .byte 2 # Two directories
+ .quad str_LT_5a
+ .quad str_LT_5b
+ # File table format
+ .byte 4 # Four elements per file entry
+ .byte 1 # DW_LNCT_path
+ .byte 0x08 # DW_FORM_string
+ .byte 2 # DW_LNCT_directory_index
+ .byte 0x0b # DW_FORM_data1
+ .byte 3 # DW_LNCT_timestamp
+ .byte 0x0f # DW_FORM_udata
+ .byte 4 # DW_LNCT_size
+ .byte 0x0f # DW_FORM_udata
+ # File table entries
+ .byte 2 # Two files
+ .asciz "File5a"
+ .byte 1
+ .byte 0x51
+ .byte 0x52
+ .asciz "File5b"
+ .byte 2
+ .byte 0x53
+ .byte 0x54
+LH_5_header_end:
+ # Line number program, which is empty.
+LH_5_end:
+
+# CHECK-LABEL: .debug_line contents:
+# CHECK: Line table prologue:
+# CHECK: total_length: 0x00000050
+# CHECK: version: 5
+# CHECK: address_size: 8
+# CHECK: seg_select_size: 0
+# CHECK: prologue_length: 0x00000044
+# CHECK: max_ops_per_inst: 1
+# CHECK: include_directories[ 1] = 'Directory5a'
+# CHECK: include_directories[ 2] = 'Directory5b'
+# CHECK-NOT: include_directories
+# CHECK: file_names[ 1] 1 0x00000051 0x00000052 File5a{{$}}
+# CHECK: file_names[ 2] 2 0x00000053 0x00000054 File5b{{$}}
+# CHECK-NOT: file_names
diff --git a/test/DebugInfo/Inputs/dwarfdump-header.s b/test/DebugInfo/X86/dwarfdump-header.s
index c5cf4859776..d3d4e5a6827 100644
--- a/test/DebugInfo/Inputs/dwarfdump-header.s
+++ b/test/DebugInfo/X86/dwarfdump-header.s
@@ -2,9 +2,8 @@
# We have a representative set of units: v4 CU, v5 CU, v4 TU, v5 split TU.
# We have v4 and v5 line-table headers.
#
-# To generate the test object:
-# llvm-mc -triple x86_64-unknown-linux dwarfdump-header.s -filetype=obj \
-# -o dwarfdump-header.elf-x86-64
+# RUN: llvm-mc -triple x86_64-unknown-linux %s -filetype=obj -o - | \
+# RUN: llvm-dwarfdump -v - | FileCheck %s
.section .debug_str,"MS",@progbits,1
str_producer:
@@ -15,6 +14,10 @@ str_CU_5:
.asciz "V5_compile_unit"
str_TU_4:
.asciz "V4_type_unit"
+str_LT_5a:
+ .asciz "Directory5a"
+str_LT_5b:
+ .asciz "Directory5b"
.section .debug_str.dwo,"MS",@progbits,1
dwo_TU_5:
@@ -77,6 +80,7 @@ dwo_TU_5:
.byte 0x00 # EOM(3)
.section .debug_info,"",@progbits
+# CHECK-LABEL: .debug_info contents:
# DWARF v4 CU header. V4 CU headers all look the same so we do only one.
.long CU_4_end-CU_4_version # Length of Unit
@@ -92,6 +96,9 @@ CU_4_version:
.byte 0 # NULL
CU_4_end:
+# CHECK: 0x00000000: Compile Unit: length = 0x00000015 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000019)
+# CHECK: 0x0000000b: DW_TAG_compile_unit
+
# DWARF v5 normal CU header.
.long CU_5_end-CU_5_version # Length of Unit
CU_5_version:
@@ -107,7 +114,11 @@ CU_5_version:
.byte 0 # NULL
CU_5_end:
+# CHECK: 0x00000019: Compile Unit: length = 0x00000016 version = 0x0005 unit_type = DW_UT_compile abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000033)
+# CHECK: 0x00000025: DW_TAG_compile_unit
+
.section .debug_types,"",@progbits
+# CHECK-LABEL: .debug_types contents:
# DWARF v4 Type unit header. Normal/split are identical so we do only one.
TU_4_start:
@@ -129,8 +140,12 @@ TU_4_type:
.byte 0 # NULL
TU_4_end:
+# CHECK: 0x00000000: Type Unit: length = 0x0000001f version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 name = 'V4_type_unit' type_signature = 0x0011223344556677 type_offset = 0x001c (next unit at 0x00000023)
+# CHECK: 0x00000017: DW_TAG_type_unit
+
.section .debug_types.dwo,"",@progbits
# FIXME: DWARF v5 wants type units in .debug_info[.dwo] not .debug_types[.dwo].
+# CHECK: .debug_types.dwo contents:
# DWARF v5 split type unit header.
TU_split_5_start:
@@ -153,7 +168,12 @@ TU_split_5_type:
.byte 0 # NULL
TU_split_5_end:
+# CHECK: 0x00000000: Type Unit: length = 0x00000020 version = 0x0005 unit_type = DW_UT_split_type abbr_offset = 0x0000 addr_size = 0x08 name = 'V5_split_type_unit' type_signature = 0x8899aabbccddeeff type_offset = 0x001d (next unit at 0x00000024)
+# CHECK: 0x00000018: DW_TAG_type_unit
+
.section .debug_line,"",@progbits
+# CHECK-LABEL: .debug_line contents:
+
# DWARF v4 line-table header.
LH_4_start:
.long LH_4_end-LH_4_version # Length of Unit
@@ -197,6 +217,18 @@ LH_4_header_end:
# Line number program, which is empty.
LH_4_end:
+# CHECK: Line table prologue:
+# CHECK: version: 4
+# CHECK-NOT: address_size
+# CHECK-NOT: seg_select_size
+# CHECK: max_ops_per_inst: 1
+# CHECK: include_directories[ 1] = 'Directory4a'
+# CHECK: include_directories[ 2] = 'Directory4b'
+# CHECK-NOT: include_directories
+# CHECK: file_names[ 1] 1 0x00000041 0x00000042 File4a{{$}}
+# CHECK: file_names[ 2] 0 0x00000043 0x00000044 File4b{{$}}
+# CHECK-NOT: file_names
+
# DWARF v5 line-table header.
LH_5_start:
.long LH_5_end-LH_5_version # Length of Unit
@@ -227,11 +259,11 @@ LH_5_params:
# Directory table format
.byte 1 # One element per directory entry
.byte 1 # DW_LNCT_path
- .byte 0x08 # DW_FORM_string
+ .byte 0x0e # DW_FORM_strp (-> .debug_str)
# Directory table entries
.byte 2 # Two directories
- .asciz "Directory5a"
- .asciz "Directory5b"
+ .long str_LT_5a
+ .long str_LT_5b
# File table format
.byte 4 # Four elements per file entry
.byte 1 # DW_LNCT_path
@@ -255,3 +287,15 @@ LH_5_params:
LH_5_header_end:
# Line number program, which is empty.
LH_5_end:
+
+# CHECK: Line table prologue:
+# CHECK: version: 5
+# CHECK: address_size: 8
+# CHECK: seg_select_size: 0
+# CHECK: max_ops_per_inst: 1
+# CHECK: include_directories[ 1] = 'Directory5a'
+# CHECK: include_directories[ 2] = 'Directory5b'
+# CHECK-NOT: include_directories
+# CHECK: file_names[ 1] 1 0x00000051 0x00000052 File5a{{$}}
+# CHECK: file_names[ 2] 2 0x00000053 0x00000054 File5b{{$}}
+# CHECK-NOT: file_names
diff --git a/test/DebugInfo/X86/live-debug-variables.ll b/test/DebugInfo/X86/live-debug-variables.ll
index fbfd1d91a81..90669f5412c 100644
--- a/test/DebugInfo/X86/live-debug-variables.ll
+++ b/test/DebugInfo/X86/live-debug-variables.ll
@@ -24,8 +24,9 @@
; CHECK: .debug_loc contents:
; CHECK-NEXT: 0x00000000:
-; CHECK-NEXT: 0x000000000000001f - 0x000000000000003c: DW_OP_reg3 RBX
-; We should only have one entry
+; We currently emit an entry for the function prologue, too, which could be optimized away.
+; CHECK: 0x000000000000001f - 0x000000000000003c: DW_OP_reg3 RBX
+; We should only have one entry inside the function.
; CHECK-NOT: :
declare i32 @foobar(i32, i32, i32, i32, i32)
diff --git a/test/DebugInfo/dwarfdump-header.test b/test/DebugInfo/dwarfdump-header.test
deleted file mode 100644
index 375f7043c9f..00000000000
--- a/test/DebugInfo/dwarfdump-header.test
+++ /dev/null
@@ -1,60 +0,0 @@
-RUN: llvm-dwarfdump -v %p/Inputs/dwarfdump-header.elf-x86-64 | FileCheck %s
-RUN: llvm-dwarfdump -v --verify %p/Inputs/dwarfdump-header.elf-x86-64
-
-The input file is hand-coded assembler to generate all the units,
-so we're willing to make exact checks for offsets and such.
-
-CHECK-LABEL: .debug_info contents:
-
-The v4 CU header.
-
-CHECK: 0x00000000: Compile Unit: length = 0x00000015 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000019)
-CHECK: 0x0000000b: DW_TAG_compile_unit
-
-The v5 normal CU header.
-
-CHECK: 0x00000019: Compile Unit: length = 0x00000016 version = 0x0005 unit_type = DW_UT_compile abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000033)
-CHECK: 0x00000025: DW_TAG_compile_unit
-
-CHECK-LABEL: .debug_types contents:
-
-The v4 type unit header.
-
-CHECK: 0x00000000: Type Unit: length = 0x0000001f version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 name = 'V4_type_unit' type_signature = 0x0011223344556677 type_offset = 0x001c (next unit at 0x00000023)
-CHECK: 0x00000017: DW_TAG_type_unit
-
-FIXME: DWARF v5 wants type units in .debug_info[.dwo] not .debug_types[.dwo].
-CHECK: .debug_types.dwo contents:
-
-CHECK: 0x00000000: Type Unit: length = 0x00000020 version = 0x0005 unit_type = DW_UT_split_type abbr_offset = 0x0000 addr_size = 0x08 name = 'V5_split_type_unit' type_signature = 0x8899aabbccddeeff type_offset = 0x001d (next unit at 0x00000024)
-CHECK: 0x00000018: DW_TAG_type_unit
-
-CHECK-LABEL: .debug_line contents:
-
-The v4 line table header.
-
-CHECK: Line table prologue:
-CHECK: version: 4
-CHECK-NOT: address_size
-CHECK-NOT: seg_select_size
-CHECK: max_ops_per_inst: 1
-CHECK: include_directories[ 1] = 'Directory4a'
-CHECK: include_directories[ 2] = 'Directory4b'
-CHECK-NOT: include_directories
-CHECK: file_names[ 1] 1 0x00000041 0x00000042 File4a{{$}}
-CHECK: file_names[ 2] 0 0x00000043 0x00000044 File4b{{$}}
-CHECK-NOT: file_names
-
-The v5 line table header.
-
-CHECK: Line table prologue:
-CHECK: version: 5
-CHECK: address_size: 8
-CHECK: seg_select_size: 0
-CHECK: max_ops_per_inst: 1
-CHECK: include_directories[ 1] = 'Directory5a'
-CHECK: include_directories[ 2] = 'Directory5b'
-CHECK-NOT: include_directories
-CHECK: file_names[ 1] 1 0x00000051 0x00000052 File5a{{$}}
-CHECK: file_names[ 2] 2 0x00000053 0x00000054 File5b{{$}}
-CHECK-NOT: file_names
diff --git a/test/FileCheck/defines.txt b/test/FileCheck/defines.txt
new file mode 100644
index 00000000000..d2219b7ca25
--- /dev/null
+++ b/test/FileCheck/defines.txt
@@ -0,0 +1,9 @@
+; RUN: FileCheck -DVALUE=10 -input-file %s %s
+; RUN: not FileCheck -DVALUE=20 -input-file %s %s 2>&1 | FileCheck %s -check-prefix ERRMSG
+
+Value = 10
+; CHECK: Value = [[VALUE]]
+
+; ERRMSG: defines.txt:5:10: error: expected string not found in input
+; ERRMSG: defines.txt:1:1: note: with variable "VALUE" equal to "20"
+; ERRMSG: defines.txt:4:1: note: possible intended match here
diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll b/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll
index c3c2435fc87..1fc20febc94 100644
--- a/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll
+++ b/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll
@@ -39,8 +39,7 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: [[B]]:
; CHECK-NEXT: popfq
-; CHECK: rep
-; CHECK-NEXT: movsb (%rsi), %es:(%rdi)
+; CHECK: rep movsb (%rsi), %es:(%rdi)
; Function Attrs: nounwind sanitize_address uwtable
define void @rep_movs_1b(i8* %dst, i8* %src, i64 %n) #0 {
@@ -73,8 +72,7 @@ entry:
; CHECK: [[Q]]:
; CHECK-NEXT: popfq
-; CHECK: rep
-; CHECK-NEXT: movsq (%rsi), %es:(%rdi)
+; CHECK: rep movsq (%rsi), %es:(%rdi)
; Function Attrs: nounwind sanitize_address uwtable
define void @rep_movs_8b(i64* %dst, i64* %src, i64 %n) #0 {
diff --git a/test/LTO/Resolution/X86/comdat-mixed-lto.ll b/test/LTO/Resolution/X86/comdat-mixed-lto.ll
index f6ee22e4161..d6022c64351 100644
--- a/test/LTO/Resolution/X86/comdat-mixed-lto.ll
+++ b/test/LTO/Resolution/X86/comdat-mixed-lto.ll
@@ -17,7 +17,7 @@
; would clash with the copy from this module.
; RUN: llvm-dis %t3.0.0.preopt.bc -o - | FileCheck %s
; CHECK: define internal void @__cxx_global_var_init() section ".text.startup" {
-; CHECK: define available_externally void @testglobfunc() section ".text.startup" {
+; CHECK: define available_externally dso_local void @testglobfunc() section ".text.startup" {
; ModuleID = 'comdat-mixed-lto.o'
source_filename = "comdat-mixed-lto.cpp"
diff --git a/test/LTO/Resolution/X86/comdat.ll b/test/LTO/Resolution/X86/comdat.ll
index 60d082b3e0f..94f28384231 100644
--- a/test/LTO/Resolution/X86/comdat.ll
+++ b/test/LTO/Resolution/X86/comdat.ll
@@ -70,14 +70,14 @@ bb11:
; CHECK-DAG: @a23 = alias i32 (i8*), i32 (i8*)* @f1.2{{$}}
; CHECK-DAG: @a24 = alias i16, bitcast (i32 (i8*)* @f1.2 to i16*)
-; CHECK: define weak_odr i32 @f1(i8*) comdat($c1) {
+; CHECK: define weak_odr dso_local i32 @f1(i8*) comdat($c1) {
; CHECK-NEXT: bb10:
; CHECK-NEXT: br label %bb11{{$}}
; CHECK: bb11:
; CHECK-NEXT: ret i32 42
; CHECK-NEXT: }
-; CHECK: define internal i32 @f1.2(i8* %this) comdat($c2) {
+; CHECK: define internal dso_local i32 @f1.2(i8* %this) comdat($c2) {
; CHECK-NEXT: bb20:
; CHECK-NEXT: store i8* %this, i8** null
; CHECK-NEXT: br label %bb21
diff --git a/test/LTO/Resolution/X86/commons.ll b/test/LTO/Resolution/X86/commons.ll
index 28bf1ada4a8..8adfb87d6ed 100644
--- a/test/LTO/Resolution/X86/commons.ll
+++ b/test/LTO/Resolution/X86/commons.ll
@@ -4,7 +4,7 @@
; RUN: llvm-dis -o - %t.out.0.0.preopt.bc | FileCheck %s
; A strong definition should override the common
-; CHECK: @x = global i32 42, align 4
+; CHECK: @x = dso_local global i32 42, align 4
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/MC/AArch64/SVE/assembler_tests/add.s b/test/MC/AArch64/SVE/assembler_tests/add.s
new file mode 100644
index 00000000000..7906dbbaf88
--- /dev/null
+++ b/test/MC/AArch64/SVE/assembler_tests/add.s
@@ -0,0 +1,66 @@
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -show-encoding -mattr=+sve < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64-none-linux-gnu -show-encoding -mattr=-sve 2>&1 < %s | FileCheck --check-prefix=CHECK-ERROR %s
+add z31.s, z31.s, z31.s // 00000100-10111111-00000011-11111111
+// CHECK: add z31.s, z31.s, z31.s // encoding: [0xff,0x03,0xbf,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-10111111-00000011-11111111
+add z23.d, z13.d, z8.d // 00000100-11101000-00000001-10110111
+// CHECK: add z23.d, z13.d, z8.d // encoding: [0xb7,0x01,0xe8,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-11101000-00000001-10110111
+add z0.s, z0.s, z0.s // 00000100-10100000-00000000-00000000
+// CHECK: add z0.s, z0.s, z0.s // encoding: [0x00,0x00,0xa0,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-10100000-00000000-00000000
+add z31.d, z31.d, z31.d // 00000100-11111111-00000011-11111111
+// CHECK: add z31.d, z31.d, z31.d // encoding: [0xff,0x03,0xff,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-11111111-00000011-11111111
+add z21.b, z10.b, z21.b // 00000100-00110101-00000001-01010101
+// CHECK: add z21.b, z10.b, z21.b // encoding: [0x55,0x01,0x35,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-00110101-00000001-01010101
+add z31.b, z31.b, z31.b // 00000100-00111111-00000011-11111111
+// CHECK: add z31.b, z31.b, z31.b // encoding: [0xff,0x03,0x3f,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-00111111-00000011-11111111
+add z0.h, z0.h, z0.h // 00000100-01100000-00000000-00000000
+// CHECK: add z0.h, z0.h, z0.h // encoding: [0x00,0x00,0x60,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-01100000-00000000-00000000
+add z23.b, z13.b, z8.b // 00000100-00101000-00000001-10110111
+// CHECK: add z23.b, z13.b, z8.b // encoding: [0xb7,0x01,0x28,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-00101000-00000001-10110111
+add z0.d, z0.d, z0.d // 00000100-11100000-00000000-00000000
+// CHECK: add z0.d, z0.d, z0.d // encoding: [0x00,0x00,0xe0,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-11100000-00000000-00000000
+add z31.h, z31.h, z31.h // 00000100-01111111-00000011-11111111
+// CHECK: add z31.h, z31.h, z31.h // encoding: [0xff,0x03,0x7f,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-01111111-00000011-11111111
+add z0.b, z0.b, z0.b // 00000100-00100000-00000000-00000000
+// CHECK: add z0.b, z0.b, z0.b // encoding: [0x00,0x00,0x20,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-00100000-00000000-00000000
+add z21.d, z10.d, z21.d // 00000100-11110101-00000001-01010101
+// CHECK: add z21.d, z10.d, z21.d // encoding: [0x55,0x01,0xf5,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-11110101-00000001-01010101
+add z21.h, z10.h, z21.h // 00000100-01110101-00000001-01010101
+// CHECK: add z21.h, z10.h, z21.h // encoding: [0x55,0x01,0x75,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-01110101-00000001-01010101
+add z21.s, z10.s, z21.s // 00000100-10110101-00000001-01010101
+// CHECK: add z21.s, z10.s, z21.s // encoding: [0x55,0x01,0xb5,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-10110101-00000001-01010101
+add z23.h, z13.h, z8.h // 00000100-01101000-00000001-10110111
+// CHECK: add z23.h, z13.h, z8.h // encoding: [0xb7,0x01,0x68,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-01101000-00000001-10110111
+add z23.s, z13.s, z8.s // 00000100-10101000-00000001-10110111
+// CHECK: add z23.s, z13.s, z8.s // encoding: [0xb7,0x01,0xa8,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-10101000-00000001-10110111
diff --git a/test/MC/AArch64/SVE/assembler_tests/sub.s b/test/MC/AArch64/SVE/assembler_tests/sub.s
new file mode 100644
index 00000000000..ee283afdb7f
--- /dev/null
+++ b/test/MC/AArch64/SVE/assembler_tests/sub.s
@@ -0,0 +1,66 @@
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -show-encoding -mattr=+sve < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64-none-linux-gnu -show-encoding -mattr=-sve 2>&1 < %s | FileCheck --check-prefix=CHECK-ERROR %s
+sub z0.h, z0.h, z0.h // 00000100-01100000-00000100-00000000
+// CHECK: sub z0.h, z0.h, z0.h // encoding: [0x00,0x04,0x60,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-01100000-00000100-00000000
+sub z21.b, z10.b, z21.b // 00000100-00110101-00000101-01010101
+// CHECK: sub z21.b, z10.b, z21.b // encoding: [0x55,0x05,0x35,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-00110101-00000101-01010101
+sub z31.h, z31.h, z31.h // 00000100-01111111-00000111-11111111
+// CHECK: sub z31.h, z31.h, z31.h // encoding: [0xff,0x07,0x7f,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-01111111-00000111-11111111
+sub z21.h, z10.h, z21.h // 00000100-01110101-00000101-01010101
+// CHECK: sub z21.h, z10.h, z21.h // encoding: [0x55,0x05,0x75,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-01110101-00000101-01010101
+sub z31.b, z31.b, z31.b // 00000100-00111111-00000111-11111111
+// CHECK: sub z31.b, z31.b, z31.b // encoding: [0xff,0x07,0x3f,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-00111111-00000111-11111111
+sub z0.s, z0.s, z0.s // 00000100-10100000-00000100-00000000
+// CHECK: sub z0.s, z0.s, z0.s // encoding: [0x00,0x04,0xa0,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-10100000-00000100-00000000
+sub z23.b, z13.b, z8.b // 00000100-00101000-00000101-10110111
+// CHECK: sub z23.b, z13.b, z8.b // encoding: [0xb7,0x05,0x28,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-00101000-00000101-10110111
+sub z21.d, z10.d, z21.d // 00000100-11110101-00000101-01010101
+// CHECK: sub z21.d, z10.d, z21.d // encoding: [0x55,0x05,0xf5,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-11110101-00000101-01010101
+sub z21.s, z10.s, z21.s // 00000100-10110101-00000101-01010101
+// CHECK: sub z21.s, z10.s, z21.s // encoding: [0x55,0x05,0xb5,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-10110101-00000101-01010101
+sub z0.b, z0.b, z0.b // 00000100-00100000-00000100-00000000
+// CHECK: sub z0.b, z0.b, z0.b // encoding: [0x00,0x04,0x20,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-00100000-00000100-00000000
+sub z23.d, z13.d, z8.d // 00000100-11101000-00000101-10110111
+// CHECK: sub z23.d, z13.d, z8.d // encoding: [0xb7,0x05,0xe8,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-11101000-00000101-10110111
+sub z23.s, z13.s, z8.s // 00000100-10101000-00000101-10110111
+// CHECK: sub z23.s, z13.s, z8.s // encoding: [0xb7,0x05,0xa8,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-10101000-00000101-10110111
+sub z31.d, z31.d, z31.d // 00000100-11111111-00000111-11111111
+// CHECK: sub z31.d, z31.d, z31.d // encoding: [0xff,0x07,0xff,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-11111111-00000111-11111111
+sub z23.h, z13.h, z8.h // 00000100-01101000-00000101-10110111
+// CHECK: sub z23.h, z13.h, z8.h // encoding: [0xb7,0x05,0x68,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-01101000-00000101-10110111
+sub z0.d, z0.d, z0.d // 00000100-11100000-00000100-00000000
+// CHECK: sub z0.d, z0.d, z0.d // encoding: [0x00,0x04,0xe0,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-11100000-00000100-00000000
+sub z31.s, z31.s, z31.s // 00000100-10111111-00000111-11111111
+// CHECK: sub z31.s, z31.s, z31.s // encoding: [0xff,0x07,0xbf,0x04]
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR-NEXT: 00000100-10111111-00000111-11111111
diff --git a/test/MC/AArch64/SVE/disassembler_tests/add.s b/test/MC/AArch64/SVE/disassembler_tests/add.s
new file mode 100644
index 00000000000..22a61fb4a84
--- /dev/null
+++ b/test/MC/AArch64/SVE/disassembler_tests/add.s
@@ -0,0 +1,50 @@
+# RUN: llvm-mc -triple=aarch64-none-linux-gnu -show-encoding -disassemble -mattr=+sve < %s | FileCheck %s
+# RUN: llvm-mc -triple=aarch64-none-linux-gnu -show-encoding -disassemble -mattr=-sve 2>&1 < %s | FileCheck --check-prefix=CHECK-ERROR %s
+0xff,0x03,0xbf,0x04
+# CHECK: add z31.s, z31.s, z31.s // encoding: [0xff,0x03,0xbf,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xb7,0x01,0xe8,0x04
+# CHECK: add z23.d, z13.d, z8.d // encoding: [0xb7,0x01,0xe8,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x00,0x00,0xa0,0x04
+# CHECK: add z0.s, z0.s, z0.s // encoding: [0x00,0x00,0xa0,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xff,0x03,0xff,0x04
+# CHECK: add z31.d, z31.d, z31.d // encoding: [0xff,0x03,0xff,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x55,0x01,0x35,0x04
+# CHECK: add z21.b, z10.b, z21.b // encoding: [0x55,0x01,0x35,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xff,0x03,0x3f,0x04
+# CHECK: add z31.b, z31.b, z31.b // encoding: [0xff,0x03,0x3f,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x00,0x00,0x60,0x04
+# CHECK: add z0.h, z0.h, z0.h // encoding: [0x00,0x00,0x60,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xb7,0x01,0x28,0x04
+# CHECK: add z23.b, z13.b, z8.b // encoding: [0xb7,0x01,0x28,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x00,0x00,0xe0,0x04
+# CHECK: add z0.d, z0.d, z0.d // encoding: [0x00,0x00,0xe0,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xff,0x03,0x7f,0x04
+# CHECK: add z31.h, z31.h, z31.h // encoding: [0xff,0x03,0x7f,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x00,0x00,0x20,0x04
+# CHECK: add z0.b, z0.b, z0.b // encoding: [0x00,0x00,0x20,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x55,0x01,0xf5,0x04
+# CHECK: add z21.d, z10.d, z21.d // encoding: [0x55,0x01,0xf5,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x55,0x01,0x75,0x04
+# CHECK: add z21.h, z10.h, z21.h // encoding: [0x55,0x01,0x75,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x55,0x01,0xb5,0x04
+# CHECK: add z21.s, z10.s, z21.s // encoding: [0x55,0x01,0xb5,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xb7,0x01,0x68,0x04
+# CHECK: add z23.h, z13.h, z8.h // encoding: [0xb7,0x01,0x68,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xb7,0x01,0xa8,0x04
+# CHECK: add z23.s, z13.s, z8.s // encoding: [0xb7,0x01,0xa8,0x04]
+# CHECK-ERROR: invalid instruction encoding
diff --git a/test/MC/AArch64/SVE/disassembler_tests/sub.s b/test/MC/AArch64/SVE/disassembler_tests/sub.s
new file mode 100644
index 00000000000..e7acde952a7
--- /dev/null
+++ b/test/MC/AArch64/SVE/disassembler_tests/sub.s
@@ -0,0 +1,50 @@
+# RUN: llvm-mc -triple=aarch64-none-linux-gnu -show-encoding -disassemble -mattr=+sve < %s | FileCheck %s
+# RUN: llvm-mc -triple=aarch64-none-linux-gnu -show-encoding -disassemble -mattr=-sve 2>&1 < %s | FileCheck --check-prefix=CHECK-ERROR %s
+0x00,0x04,0x60,0x04
+# CHECK: sub z0.h, z0.h, z0.h // encoding: [0x00,0x04,0x60,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x55,0x05,0x35,0x04
+# CHECK: sub z21.b, z10.b, z21.b // encoding: [0x55,0x05,0x35,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xff,0x07,0x7f,0x04
+# CHECK: sub z31.h, z31.h, z31.h // encoding: [0xff,0x07,0x7f,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x55,0x05,0x75,0x04
+# CHECK: sub z21.h, z10.h, z21.h // encoding: [0x55,0x05,0x75,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xff,0x07,0x3f,0x04
+# CHECK: sub z31.b, z31.b, z31.b // encoding: [0xff,0x07,0x3f,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x00,0x04,0xa0,0x04
+# CHECK: sub z0.s, z0.s, z0.s // encoding: [0x00,0x04,0xa0,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xb7,0x05,0x28,0x04
+# CHECK: sub z23.b, z13.b, z8.b // encoding: [0xb7,0x05,0x28,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x55,0x05,0xf5,0x04
+# CHECK: sub z21.d, z10.d, z21.d // encoding: [0x55,0x05,0xf5,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x55,0x05,0xb5,0x04
+# CHECK: sub z21.s, z10.s, z21.s // encoding: [0x55,0x05,0xb5,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x00,0x04,0x20,0x04
+# CHECK: sub z0.b, z0.b, z0.b // encoding: [0x00,0x04,0x20,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xb7,0x05,0xe8,0x04
+# CHECK: sub z23.d, z13.d, z8.d // encoding: [0xb7,0x05,0xe8,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xb7,0x05,0xa8,0x04
+# CHECK: sub z23.s, z13.s, z8.s // encoding: [0xb7,0x05,0xa8,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xff,0x07,0xff,0x04
+# CHECK: sub z31.d, z31.d, z31.d // encoding: [0xff,0x07,0xff,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xb7,0x05,0x68,0x04
+# CHECK: sub z23.h, z13.h, z8.h // encoding: [0xb7,0x05,0x68,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0x00,0x04,0xe0,0x04
+# CHECK: sub z0.d, z0.d, z0.d // encoding: [0x00,0x04,0xe0,0x04]
+# CHECK-ERROR: invalid instruction encoding
+0xff,0x07,0xbf,0x04
+# CHECK: sub z31.s, z31.s, z31.s // encoding: [0xff,0x07,0xbf,0x04]
+# CHECK-ERROR: invalid instruction encoding
diff --git a/test/MC/Disassembler/Mips/micromips32r3/valid-el.txt b/test/MC/Disassembler/Mips/micromips32r3/valid-el.txt
index dc76f48a95a..0cd74f5ba71 100644
--- a/test/MC/Disassembler/Mips/micromips32r3/valid-el.txt
+++ b/test/MC/Disassembler/Mips/micromips32r3/valid-el.txt
@@ -27,6 +27,7 @@
0x09 0x46 # CHECK: mfhi $9
0x49 0x46 # CHECK: mflo $9
0x21 0x0f # CHECK: move $25, $1
+0x9a 0x85 # CHECK: movep $4, $21, $18, $17
0xa9 0x45 # CHECK: jrc $9
0xc9 0x45 # CHECK: jalr $9
0xe9 0x45 # CHECK: jalrs16 $9
diff --git a/test/MC/Disassembler/Mips/micromips32r3/valid.txt b/test/MC/Disassembler/Mips/micromips32r3/valid.txt
index 38d6897e1c4..dbab070b874 100644
--- a/test/MC/Disassembler/Mips/micromips32r3/valid.txt
+++ b/test/MC/Disassembler/Mips/micromips32r3/valid.txt
@@ -27,6 +27,7 @@
0x46 0x09 # CHECK: mfhi $9
0x46 0x49 # CHECK: mflo $9
0x0f 0x21 # CHECK: move $25, $1
+0x85 0x9a # CHECK: movep $4, $21, $18, $17
0x45 0xa9 # CHECK: jrc $9
0x45 0xc9 # CHECK: jalr $9
0x45 0xe9 # CHECK: jalrs16 $9
diff --git a/test/MC/Disassembler/Mips/micromips32r6/valid.txt b/test/MC/Disassembler/Mips/micromips32r6/valid.txt
index f32f2532c24..462866d3347 100644
--- a/test/MC/Disassembler/Mips/micromips32r6/valid.txt
+++ b/test/MC/Disassembler/Mips/micromips32r6/valid.txt
@@ -21,7 +21,7 @@
0x29 0x82 # CHECK: lhu16 $3, 4($16)
0x09 0x94 # CHECK: lbu16 $3, 4($17)
0x09 0x9f # CHECK: lbu16 $3, -1($17)
-0x84 0x34 # CHECK: movep $5, $6, $2, $3
+0x44 0x36 # CHECK: movep $5, $6, $2, $3
0x04 0xcc # CHECK: addu16 $6, $17, $4
0x44 0x21 # CHECK: and16 $16, $2
0x2e 0x56 # CHECK: andi16 $4, $5, 8
diff --git a/test/MC/Disassembler/Mips/micromips64r6/valid.txt b/test/MC/Disassembler/Mips/micromips64r6/valid.txt
index 9186e66d4d0..07cea0d77c5 100644
--- a/test/MC/Disassembler/Mips/micromips64r6/valid.txt
+++ b/test/MC/Disassembler/Mips/micromips64r6/valid.txt
@@ -23,7 +23,7 @@
0x45 0x2b # CHECK: jalr $9
0x45 0x23 # CHECK: jrc16 $9
0x44 0xb3 # CHECK: jrcaddiusp 20
-0x84 0x34 # CHECK: movep $5, $6, $2, $3
+0x44 0x36 # CHECK: movep $5, $6, $2, $3
0x45 0xf9 # CHECK: or16 $3, $7
0x60 0x44 0x30 0x08 # CHECK: ll $2, 8($4)
0x20 0x44 0x50 0x08 # CHECK: lwm32 $16, $17, 8($4)
diff --git a/test/MC/Disassembler/X86/prefixes-i386.txt b/test/MC/Disassembler/X86/prefixes-i386.txt
index ff2fb223873..3152cc31aad 100644
--- a/test/MC/Disassembler/X86/prefixes-i386.txt
+++ b/test/MC/Disassembler/X86/prefixes-i386.txt
@@ -3,85 +3,59 @@
# CHECK: movl %fs:24, %eax
0x64 0xa1 0x18 0x00 0x00 0x00 # mov eax, dword ptr fs:[18h]
-# CHECK: rep
-# CHECK-NEXT: insb %dx, %es:(%edi)
+# CHECK: rep insb %dx, %es:(%edi)
0xf3 0x6c #rep ins
-# CHECK: rep
-# CHECK-NEXT: insl %dx, %es:(%edi)
+# CHECK: rep insl %dx, %es:(%edi)
0xf3 0x6d #rep ins
-# CHECK: rep
-# CHECK-NEXT: movsb (%esi), %es:(%edi)
+# CHECK: rep movsb (%esi), %es:(%edi)
0xf3 0xa4 #rep movs
-# CHECK: rep
-# CHECK-NEXT: movsl (%esi), %es:(%edi)
+# CHECK: rep movsl (%esi), %es:(%edi)
0xf3 0xa5 #rep movs
-# CHECK: rep
-# CHECK-NEXT: outsb (%esi), %dx
+# CHECK: rep outsb (%esi), %dx
0xf3 0x6e #rep outs
-# CHECK: rep
-# CHECK-NEXT: outsl (%esi), %dx
+# CHECK: rep outsl (%esi), %dx
0xf3 0x6f #rep outs
-# CHECK: rep
-# CHECK-NEXT: lodsb (%esi), %al
+# CHECK: rep lodsb (%esi), %al
0xf3 0xac #rep lods
-# CHECK: rep
-# CHECK-NEXT: lodsl (%esi), %eax
+# CHECK: rep lodsl (%esi), %eax
0xf3 0xad #rep lods
-# CHECK: rep
-# CHECK-NEXT: stosb %al, %es:(%edi)
+# CHECK: rep stosb %al, %es:(%edi)
0xf3 0xaa #rep stos
-# CHECK: rep
-# CHECK-NEXT: stosl %eax, %es:(%edi)
+# CHECK: rep stosl %eax, %es:(%edi)
0xf3 0xab #rep stos
-# CHECK: rep
-# CHECK-NEXT: cmpsb %es:(%edi), (%esi)
+# CHECK: rep cmpsb %es:(%edi), (%esi)
0xf3 0xa6 #rep cmps
-# CHECK: rep
-# CHECK-NEXT: cmpsl %es:(%edi), (%esi)
+# CHECK: rep cmpsl %es:(%edi), (%esi)
0xf3 0xa7 #repe cmps
-# CHECK: rep
-# CHECK-NEXT: scasb %es:(%edi), %al
+# CHECK: rep scasb %es:(%edi), %al
0xf3 0xae #repe scas
-# CHECK: rep
-# CHECK-NEXT: scasl %es:(%edi), %eax
+# CHECK: rep scasl %es:(%edi), %eax
0xf3 0xaf #repe scas
-# CHECK: repne
-# CHECK-NEXT: cmpsb %es:(%edi), (%esi)
+# CHECK: repne cmpsb %es:(%edi), (%esi)
0xf2 0xa6 #repne cmps
-# CHECK: repne
-# CHECK-NEXT: cmpsl %es:(%edi), (%esi)
+# CHECK: repne cmpsl %es:(%edi), (%esi)
0xf2 0xa7 #repne cmps
-# CHECK: repne
-# CHECK-NEXT: scasb %es:(%edi), %al
+# CHECK: repne scasb %es:(%edi), %al
0xf2 0xae #repne scas
-# CHECK: repne
-# CHECK-NEXT: scasl %es:(%edi), %eax
+# CHECK: repne scasl %es:(%edi), %eax
0xf2 0xaf #repne scas
-# CHECK: repne
-# CHECK-NEXT: scasw %es:(%edi), %ax
+# CHECK: repne scasw %es:(%edi), %ax
0xf2 0x66 0xaf
-# CHECK: repne
-# CHECK-NEXT: scasw %es:(%edi), %ax
+# CHECK: repne scasw %es:(%edi), %ax
0x66 0xf2 0xaf
-# CHECK: rep
-# CHECK-NEXT: scasw %es:(%edi), %ax
+# CHECK: rep scasw %es:(%edi), %ax
0xf3 0x66 0xaf
-# CHECK: rep
-# CHECK-NEXT: scasw %es:(%edi), %ax
+# CHECK: rep scasw %es:(%edi), %ax
0x66 0xf3 0xaf
-# CHECK: repne
-# CHECK: insw %dx, %es:(%edi)
+# CHECK: repne insw %dx, %es:(%edi)
0xf2 0x66 0x6d
-# CHECK: repne
-# CHECK: insw %dx, %es:(%edi)
+# CHECK: repne insw %dx, %es:(%edi)
0x66 0xf2 0x6d
-# CHECK: rep
-# CHECK: insw %dx, %es:(%edi)
+# CHECK: rep insw %dx, %es:(%edi)
0xf3 0x66 0x6d
-# CHECK: rep
-# CHECK: insw %dx, %es:(%edi)
+# CHECK: rep insw %dx, %es:(%edi)
0x66 0xf3 0x6d
diff --git a/test/MC/Disassembler/X86/prefixes-x86_64.txt b/test/MC/Disassembler/X86/prefixes-x86_64.txt
index 7a9208f7b63..c9bf512aa75 100644
--- a/test/MC/Disassembler/X86/prefixes-x86_64.txt
+++ b/test/MC/Disassembler/X86/prefixes-x86_64.txt
@@ -9,30 +9,22 @@
# CHECK: mulsd %xmm7, %xmm7
0xf2 0x66 0x0f 0x59 0xff
-# CHECK: repne
-# CHECK-NEXT: scasw %es:(%rdi), %ax
+# CHECK: repne scasw %es:(%rdi), %ax
0xf2 0x66 0xaf
-# CHECK: rep
-# CHECK-NEXT: scasw %es:(%rdi), %ax
+# CHECK: repne scasw %es:(%rdi), %ax
0x66 0xf2 0xaf
-# CHECK: rep
-# CHECK-NEXT: scasw %es:(%rdi), %ax
+# CHECK: rep scasw %es:(%rdi), %ax
0xf3 0x66 0xaf
-# CHECK: rep
-# CHECK-NEXT: scasw %es:(%rdi), %ax
+# CHECK: rep scasw %es:(%rdi), %ax
0x66 0xf3 0xaf
-# CHECK: repne
-# CHECK: insw %dx, %es:(%rdi)
+# CHECK: repne insw %dx, %es:(%rdi)
0xf2 0x66 0x6d
-# CHECK: repne
-# CHECK: insw %dx, %es:(%rdi)
+# CHECK: repne insw %dx, %es:(%rdi)
0x66 0xf2 0x6d
-# CHECK: rep
-# CHECK: insw %dx, %es:(%rdi)
+# CHECK: rep insw %dx, %es:(%rdi)
0xf3 0x66 0x6d
-# CHECK: rep
-# CHECK: insw %dx, %es:(%rdi)
+# CHECK: rep insw %dx, %es:(%rdi)
0x66 0xf3 0x6d
diff --git a/test/MC/Disassembler/X86/prefixes.txt b/test/MC/Disassembler/X86/prefixes.txt
index 983e09670d6..75e11ae93f4 100644
--- a/test/MC/Disassembler/X86/prefixes.txt
+++ b/test/MC/Disassembler/X86/prefixes.txt
@@ -1,73 +1,53 @@
# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s
-# CHECK: rep
-# CHECK-NEXT: insb %dx, %es:(%rdi)
+# CHECK: rep insb %dx, %es:(%rdi)
0xf3 0x6c #rep ins
-# CHECK: rep
-# CHECK-NEXT: insl %dx, %es:(%rdi)
+# CHECK: rep insl %dx, %es:(%rdi)
0xf3 0x6d #rep ins
-# CHECK: rep
-# CHECK-NEXT: movsb (%rsi), %es:(%rdi)
+# CHECK: rep movsb (%rsi), %es:(%rdi)
0xf3 0xa4 #rep movs
-# CHECK: rep
-# CHECK-NEXT: movsl (%rsi), %es:(%rdi)
+# CHECK: rep movsl (%rsi), %es:(%rdi)
0xf3 0xa5 #rep movs
-# CHECK: rep
-# CHECK-NEXT: outsb (%rsi), %dx
+# CHECK: rep outsb (%rsi), %dx
0xf3 0x6e #rep outs
-# CHECK: rep
-# CHECK-NEXT: outsl (%rsi), %dx
+# CHECK: rep outsl (%rsi), %dx
0xf3 0x6f #rep outs
-# CHECK: rep
-# CHECK-NEXT: lodsb (%rsi), %al
+# CHECK: rep lodsb (%rsi), %al
0xf3 0xac #rep lods
-# CHECK: rep
-# CHECK-NEXT: lodsl (%rsi), %eax
+# CHECK: rep lodsl (%rsi), %eax
0xf3 0xad #rep lods
-# CHECK: rep
-# CHECK-NEXT: stosb %al, %es:(%rdi)
+# CHECK: rep stosb %al, %es:(%rdi)
0xf3 0xaa #rep stos
-# CHECK: rep
-# CHECK-NEXT: stosl %eax, %es:(%rdi)
+# CHECK: rep stosl %eax, %es:(%rdi)
0xf3 0xab #rep stos
-# CHECK: rep
-# CHECK-NEXT: cmpsb %es:(%rdi), (%rsi)
+# CHECK: rep cmpsb %es:(%rdi), (%rsi)
0xf3 0xa6 #rep cmps
-# CHECK: rep
-# CHECK-NEXT: cmpsl %es:(%rdi), (%rsi)
+# CHECK: rep cmpsl %es:(%rdi), (%rsi)
0xf3 0xa7 #repe cmps
-# CHECK: rep
-# CHECK-NEXT: scasb %es:(%rdi), %al
+# CHECK: rep scasb %es:(%rdi), %al
0xf3 0xae #repe scas
-# CHECK: rep
-# CHECK-NEXT: scasl %es:(%rdi), %eax
+# CHECK: rep scasl %es:(%rdi), %eax
0xf3 0xaf #repe scas
-# CHECK: repne
-# CHECK-NEXT: cmpsb %es:(%rdi), (%rsi)
+# CHECK: repne cmpsb %es:(%rdi), (%rsi)
0xf2 0xa6 #repne cmps
-# CHECK: repne
-# CHECK-NEXT: cmpsl %es:(%rdi), (%rsi)
+# CHECK: repne cmpsl %es:(%rdi), (%rsi)
0xf2 0xa7 #repne cmps
-# CHECK: repne
-# CHECK-NEXT: scasb %es:(%rdi), %al
+# CHECK: repne scasb %es:(%rdi), %al
0xf2 0xae #repne scas
-# CHECK: repne
-# CHECK-NEXT: scasl %es:(%rdi), %eax
+# CHECK: repne scasl %es:(%rdi), %eax
0xf2 0xaf #repne scas
# CHECK: lock
-# CHECK-NEXT: orl $16, %fs:776
+# CHECK-NEXT: orl $16, %fs:776
0xf0 0x64 0x83 0x0c 0x25 0x08 0x03 0x00 0x00 0x10
# CHECK: movq %fs:768, %rdi
0x64 0x48 0x8b 0x3c 0x25 0x00 0x03 0x00 0x00
-# CHECK: rep
-# CHECK-NEXT: stosq %rax, %es:(%rdi)
+# CHECK: rep stosq %rax, %es:(%rdi)
0xf3 0x48 0xab
-# CHECK: rep
-# CHECK-NEXT: stosq %rax, %es:(%edi)
+# CHECK: rep stosq %rax, %es:(%edi)
0xf3 0x67 0x48 0xab
# CHECK: movl 32(%rbp), %eax
@@ -104,11 +84,9 @@
0x66,0x83,0xc0,0xf4
# Test that multiple redundant prefixes work (redundant, but valid x86).
-# CHECK: rep
-# CHECK-NEXT: stosq
+# CHECK: rep stosq
0xf3 0xf3 0x48 0xab
-
# Test that we can disassemble control registers above CR8
# CHECK: movq %cr15, %rax
0x44 0x0f 0x20 0xf8
diff --git a/test/MC/Disassembler/X86/simple-tests.txt b/test/MC/Disassembler/X86/simple-tests.txt
index 86d9f92fbbf..39074934164 100644
--- a/test/MC/Disassembler/X86/simple-tests.txt
+++ b/test/MC/Disassembler/X86/simple-tests.txt
@@ -851,14 +851,11 @@
0xf0 0x48 0x0f 0xc1 0xcb
# rdar://13493622 lldb doesn't print the x86 rep/repne prefix when disassembling
-# CHECK: repne
-# CHECK-NEXT: movsl
+# CHECK: repne movsl
0xf2 0xa5
-# CHECK: repne
-# CHECK-NEXT: movsq
+# CHECK: repne movsq
0xf2 0x48 0xa5
-# CHECK: repne
-# CHECK-NEXT: movb $0, (%rax)
+# CHECK: repne movb $0, (%rax)
0xf2 0xc6 0x0 0x0
# rdar://11019859 Support 2013 Haswell RTM instructions and HLE prefixes
diff --git a/test/MC/Mips/micromips32r6/valid.s b/test/MC/Mips/micromips32r6/valid.s
index 66fcf72ec7f..b47924453cb 100644
--- a/test/MC/Mips/micromips32r6/valid.s
+++ b/test/MC/Mips/micromips32r6/valid.s
@@ -84,7 +84,7 @@
lwm32 $16, $17, $18, $19, $20, $21, $22, $23, $fp, 8($4) # CHECK: lwm32 $16, $17, $18, $19, $20, $21, $22, $23, $fp, 8($4) # encoding: [0x21,0x24,0x50,0x08]
lwm32 $16, $17, $18, $19, $20, $21, $22, $23, $fp, $ra, 8($4) # CHECK: lwm32 $16, $17, $18, $19, $20, $21, $22, $23, $fp, $ra, 8($4) # encoding: [0x23,0x24,0x50,0x08]
lwm32 $16, $17, $18, $19, $20, $21, $22, $23, $fp, $ra, 8($4) # CHECK: lwm32 $16, $17, $18, $19, $20, $21, $22, $23, $fp, $ra, 8($4) # encoding: [0x23,0x24,0x50,0x08]
- movep $5, $6, $2, $3 # CHECK: movep $5, $6, $2, $3 # encoding: [0x84,0x34]
+ movep $5, $6, $2, $3 # CHECK: movep $5, $6, $2, $3 # encoding: [0x44,0x36]
rotr $2, 7 # CHECK: rotr $2, $2, 7 # encoding: [0x00,0x42,0x38,0xc0]
rotr $9, $6, 7 # CHECK: rotr $9, $6, 7 # encoding: [0x01,0x26,0x38,0xc0]
rotrv $9, $6, $7 # CHECK: rotrv $9, $6, $7 # encoding: [0x00,0xc7,0x48,0xd0]
diff --git a/test/MC/Mips/micromips64r6/valid.s b/test/MC/Mips/micromips64r6/valid.s
index 641e16c1457..a2acedb03c0 100644
--- a/test/MC/Mips/micromips64r6/valid.s
+++ b/test/MC/Mips/micromips64r6/valid.s
@@ -35,7 +35,7 @@ a:
lhu16 $3, 4($16) # CHECK: lhu16 $3, 4($16) # encoding: [0x29,0x82]
lbu16 $3, 4($17) # CHECK: lbu16 $3, 4($17) # encoding: [0x09,0x94]
lbu16 $3, -1($17) # CHECK: lbu16 $3, -1($17) # encoding: [0x09,0x9f]
- movep $5, $6, $2, $3 # CHECK: movep $5, $6, $2, $3 # encoding: [0x84,0x34]
+ movep $5, $6, $2, $3 # CHECK: movep $5, $6, $2, $3 # encoding: [0x44,0x36]
not16 $4, $7 # CHECK: not16 $4, $7 # encoding: [0x46,0x70]
or16 $3, $7 # CHECK: or16 $3, $7 # encoding: [0x45,0xf9]
ll $2, 8($4) # CHECK: ll $2, 8($4) # encoding: [0x60,0x44,0x30,0x08]
diff --git a/test/MC/Mips/tls-symbols.s b/test/MC/Mips/tls-symbols.s
new file mode 100644
index 00000000000..d5a31b18950
--- /dev/null
+++ b/test/MC/Mips/tls-symbols.s
@@ -0,0 +1,28 @@
+# RUN: llvm-mc -arch=mips < %s -position-independent -filetype=obj \
+# RUN: | llvm-readelf -symbols | FileCheck %s
+# RUN: llvm-mc -arch=mips < %s -filetype=obj | llvm-readelf -symbols \
+# RUN: | FileCheck %s
+
+# Test that TLS relocations cause symbols to be marked as TLS symbols.
+
+ .set noat
+ lui $3, %tlsgd(foo1)
+ lui $1, %dtprel_hi(foo2)
+ lui $1, %dtprel_lo(foo3)
+ lui $1, %tprel_hi(foo4)
+ lui $1, %tprel_lo(foo5)
+ lw $2, %gottprel(foo6)($28)
+
+ .hidden foo1
+ .hidden foo2
+ .hidden foo3
+ .hidden foo4
+ .hidden foo5
+ .hidden foo6
+
+# CHECK: 1: {{.+}} {{.+}} TLS GLOBAL HIDDEN UND foo1
+# CHECK: 2: {{.+}} {{.+}} TLS GLOBAL HIDDEN UND foo2
+# CHECK: 3: {{.+}} {{.+}} TLS GLOBAL HIDDEN UND foo3
+# CHECK: 4: {{.+}} {{.+}} TLS GLOBAL HIDDEN UND foo4
+# CHECK: 5: {{.+}} {{.+}} TLS GLOBAL HIDDEN UND foo5
+# CHECK: 6: {{.+}} {{.+}} TLS GLOBAL HIDDEN UND foo6
diff --git a/test/Object/Inputs/trivial-object-test.coff-arm64 b/test/Object/Inputs/trivial-object-test.coff-arm64
new file mode 100644
index 00000000000..0d23aa29524
--- /dev/null
+++ b/test/Object/Inputs/trivial-object-test.coff-arm64
Binary files differ
diff --git a/test/Object/Inputs/trivial-object-test.coff-armnt b/test/Object/Inputs/trivial-object-test.coff-armnt
new file mode 100644
index 00000000000..5bbf79f1b54
--- /dev/null
+++ b/test/Object/Inputs/trivial-object-test.coff-armnt
Binary files differ
diff --git a/test/Object/archive-SYM64-write.test b/test/Object/archive-SYM64-write.test
new file mode 100644
index 00000000000..161d6cb8191
--- /dev/null
+++ b/test/Object/archive-SYM64-write.test
@@ -0,0 +1,38 @@
+# REQUIRES: llvm-64-bits
+# REQUIRES: system-linux
+
+# RUN: yaml2obj %s > %t
+# RUN: dd if=%t of=%t bs=1 count=0 seek=2200M
+# RUN: rm -f %t.lib
+# RUN: cp %t %t2
+# RUN: llvm-ar cr %t.lib %t %t2 %p/Inputs/trivial-object-test.elf-x86-64
+# RUN: llvm-nm --print-armap %t.lib | FileCheck %s
+
+# Delete temp files. They are too large.
+# RUN: rm -f %t %t2 %t.lib
+
+!ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+Sections:
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ AddressAlign: 0x0000000000000001
+ Content: "00"
+ Size: 32
+
+# CHECK: Archive map
+# CHECK-NEXT: main in trivial-object-test.elf-x86-64
+
+# CHECK: archive-SYM64-write.test.tmp:
+
+# CHECK: archive-SYM64-write.test.tmp2:
+
+# CHECK: trivial-object-test.elf-x86-64:
+# CHECK-NEXT: U SomeOtherFunction
+# CHECK-NEXT: 0000000000000000 T main
+# CHECK-NEXT: U puts
diff --git a/test/Object/obj2yaml.test b/test/Object/obj2yaml.test
index 3d89f53bafc..7b274b31bb1 100644
--- a/test/Object/obj2yaml.test
+++ b/test/Object/obj2yaml.test
@@ -1,5 +1,7 @@
RUN: obj2yaml %p/Inputs/trivial-object-test.coff-i386 | FileCheck %s --check-prefix COFF-I386
RUN: obj2yaml %p/Inputs/trivial-object-test.coff-x86-64 | FileCheck %s --check-prefix COFF-X86-64
+RUN: obj2yaml %p/Inputs/trivial-object-test.coff-armnt | FileCheck %s --check-prefix COFF-ARMNT
+RUN: obj2yaml %p/Inputs/trivial-object-test.coff-arm64 | FileCheck %s --check-prefix COFF-ARM64
RUN: obj2yaml %p/Inputs/trivial-object-test.elf-mipsel | FileCheck %s --check-prefix ELF-MIPSEL
RUN: obj2yaml %p/Inputs/trivial-object-test.elf-mips64el | FileCheck %s --check-prefix ELF-MIPS64EL
RUN: obj2yaml %p/Inputs/trivial-object-test.elf-x86-64 | FileCheck %s --check-prefix ELF-X86-64
@@ -189,6 +191,162 @@ COFF-X86-64-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
COFF-X86-64-NEXT: ComplexType: IMAGE_SYM_DTYPE_FUNCTION
COFF-X86-64-NEXT: StorageClass: IMAGE_SYM_CLASS_STATIC
+COFF-ARMNT: header:
+COFF-ARMNT-NEXT: Machine: IMAGE_FILE_MACHINE_ARMNT
+
+COFF-ARMNT: sections:
+COFF-ARMNT-NEXT: - Name: .text
+COFF-ARMNT-NEXT: Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_PURGEABLE, IMAGE_SCN_MEM_16BIT, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+COFF-ARMNT-NEXT: Alignment: 4
+COFF-ARMNT-NEXT: SectionData: 00F000F87047
+
+COFF-ARMNT: Relocations:
+COFF-ARMNT-NEXT: - VirtualAddress: 0
+COFF-ARMNT-NEXT: SymbolName: otherFunc
+COFF-ARMNT-NEXT: Type: IMAGE_REL_ARM_BLX23T
+
+COFF-ARMNT: - Name: .data
+COFF-ARMNT-NEXT: Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+COFF-ARMNT-NEXT: Alignment: 4
+COFF-ARMNT-NEXT: SectionData: ''
+
+COFF-ARMNT: - Name: .bss
+COFF-ARMNT-NEXT: Characteristics: [ IMAGE_SCN_CNT_UNINITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+COFF-ARMNT-NEXT: Alignment: 4
+COFF-ARMNT-NEXT: SectionData: ''
+
+COFF-ARMNT: symbols:
+COFF-ARMNT-NEXT: - Name: .text
+COFF-ARMNT-NEXT: Value: 0
+COFF-ARMNT-NEXT: SectionNumber: 1
+COFF-ARMNT-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARMNT-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARMNT-NEXT: StorageClass: IMAGE_SYM_CLASS_STATIC
+COFF-ARMNT-NEXT: SectionDefinition:
+COFF-ARMNT-NEXT: Length: 6
+COFF-ARMNT-NEXT: NumberOfRelocations: 1
+COFF-ARMNT-NEXT: NumberOfLinenumbers: 0
+COFF-ARMNT-NEXT: CheckSum: 879026160
+COFF-ARMNT-NEXT: Number: 1
+
+COFF-ARMNT: - Name: .data
+COFF-ARMNT-NEXT: Value: 0
+COFF-ARMNT-NEXT: SectionNumber: 2
+COFF-ARMNT-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARMNT-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARMNT-NEXT: StorageClass: IMAGE_SYM_CLASS_STATIC
+COFF-ARMNT-NEXT: SectionDefinition:
+COFF-ARMNT-NEXT: Length: 0
+COFF-ARMNT-NEXT: NumberOfRelocations: 0
+COFF-ARMNT-NEXT: NumberOfLinenumbers: 0
+COFF-ARMNT-NEXT: CheckSum: 0
+COFF-ARMNT-NEXT: Number: 2
+
+COFF-ARMNT: - Name: .bss
+COFF-ARMNT-NEXT: Value: 0
+COFF-ARMNT-NEXT: SectionNumber: 3
+COFF-ARMNT-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARMNT-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARMNT-NEXT: StorageClass: IMAGE_SYM_CLASS_STATIC
+COFF-ARMNT-NEXT: SectionDefinition:
+COFF-ARMNT-NEXT: Length: 0
+COFF-ARMNT-NEXT: NumberOfRelocations: 0
+COFF-ARMNT-NEXT: NumberOfLinenumbers: 0
+COFF-ARMNT-NEXT: CheckSum: 0
+COFF-ARMNT-NEXT: Number: 3
+
+COFF-ARMNT: - Name: main
+COFF-ARMNT-NEXT: Value: 0
+COFF-ARMNT-NEXT: SectionNumber: 1
+COFF-ARMNT-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARMNT-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARMNT-NEXT: StorageClass: IMAGE_SYM_CLASS_EXTERNAL
+
+COFF-ARMNT: - Name: otherFunc
+COFF-ARMNT-NEXT: Value: 0
+COFF-ARMNT-NEXT: SectionNumber: 0
+COFF-ARMNT-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARMNT-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARMNT-NEXT: StorageClass: IMAGE_SYM_CLASS_EXTERNAL
+
+COFF-ARM64: header:
+COFF-ARM64-NEXT: Machine: IMAGE_FILE_MACHINE_ARM64
+
+COFF-ARM64: sections:
+COFF-ARM64-NEXT: - Name: .text
+COFF-ARM64-NEXT: Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+COFF-ARM64-NEXT: Alignment: 4
+COFF-ARM64-NEXT: SectionData: 00000094C0035FD6
+
+COFF-ARM64: Relocations:
+COFF-ARM64-NEXT: - VirtualAddress: 0
+COFF-ARM64-NEXT: SymbolName: otherFunc
+COFF-ARM64-NEXT: Type: IMAGE_REL_ARM64_BRANCH26
+
+COFF-ARM64: - Name: .data
+COFF-ARM64-NEXT: Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+COFF-ARM64-NEXT: Alignment: 4
+COFF-ARM64-NEXT: SectionData: ''
+
+COFF-ARM64: - Name: .bss
+COFF-ARM64-NEXT: Characteristics: [ IMAGE_SCN_CNT_UNINITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+COFF-ARM64-NEXT: Alignment: 4
+COFF-ARM64-NEXT: SectionData: ''
+
+COFF-ARM64: symbols:
+COFF-ARM64-NEXT: - Name: .text
+COFF-ARM64-NEXT: Value: 0
+COFF-ARM64-NEXT: SectionNumber: 1
+COFF-ARM64-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARM64-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARM64-NEXT: StorageClass: IMAGE_SYM_CLASS_STATIC
+COFF-ARM64-NEXT: SectionDefinition:
+COFF-ARM64-NEXT: Length: 8
+COFF-ARM64-NEXT: NumberOfRelocations: 1
+COFF-ARM64-NEXT: NumberOfLinenumbers: 0
+COFF-ARM64-NEXT: CheckSum: 35579893
+COFF-ARM64-NEXT: Number: 1
+
+COFF-ARM64: - Name: .data
+COFF-ARM64-NEXT: Value: 0
+COFF-ARM64-NEXT: SectionNumber: 2
+COFF-ARM64-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARM64-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARM64-NEXT: StorageClass: IMAGE_SYM_CLASS_STATIC
+COFF-ARM64-NEXT: SectionDefinition:
+COFF-ARM64-NEXT: Length: 0
+COFF-ARM64-NEXT: NumberOfRelocations: 0
+COFF-ARM64-NEXT: NumberOfLinenumbers: 0
+COFF-ARM64-NEXT: CheckSum: 0
+COFF-ARM64-NEXT: Number: 2
+
+COFF-ARM64: - Name: .bss
+COFF-ARM64-NEXT: Value: 0
+COFF-ARM64-NEXT: SectionNumber: 3
+COFF-ARM64-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARM64-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARM64-NEXT: StorageClass: IMAGE_SYM_CLASS_STATIC
+COFF-ARM64-NEXT: SectionDefinition:
+COFF-ARM64-NEXT: Length: 0
+COFF-ARM64-NEXT: NumberOfRelocations: 0
+COFF-ARM64-NEXT: NumberOfLinenumbers: 0
+COFF-ARM64-NEXT: CheckSum: 0
+COFF-ARM64-NEXT: Number: 3
+
+COFF-ARM64: - Name: main
+COFF-ARM64-NEXT: Value: 0
+COFF-ARM64-NEXT: SectionNumber: 1
+COFF-ARM64-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARM64-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARM64-NEXT: StorageClass: IMAGE_SYM_CLASS_EXTERNAL
+
+COFF-ARM64: - Name: otherFunc
+COFF-ARM64-NEXT: Value: 0
+COFF-ARM64-NEXT: SectionNumber: 0
+COFF-ARM64-NEXT: SimpleType: IMAGE_SYM_TYPE_NULL
+COFF-ARM64-NEXT: ComplexType: IMAGE_SYM_DTYPE_NULL
+COFF-ARM64-NEXT: StorageClass: IMAGE_SYM_CLASS_EXTERNAL
+
ELF-MIPSEL: FileHeader:
ELF-MIPSEL-NEXT: Class: ELFCLASS32
ELF-MIPSEL-NEXT: Data: ELFDATA2LSB
diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll
index 816f75310e3..0810a13c141 100644
--- a/test/Other/new-pm-defaults.ll
+++ b/test/Other/new-pm-defaults.ll
@@ -76,6 +76,7 @@
; CHECK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass
+; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass
; CHECK-O-NEXT: Finished llvm::Function pass manager run.
; CHECK-O-NEXT: Running pass: IPSCCPPass
; CHECK-O-NEXT: Running pass: CalledValuePropagationPass
diff --git a/test/Other/new-pm-lto-defaults.ll b/test/Other/new-pm-lto-defaults.ll
index fc52f70ff4c..878198d1447 100644
--- a/test/Other/new-pm-lto-defaults.ll
+++ b/test/Other/new-pm-lto-defaults.ll
@@ -29,9 +29,14 @@
; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass
; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass
; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
+; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Module
+; CHECK-O2-NEXT: Starting llvm::Function pass manager run.
+; CHECK-O2-NEXT: Running pass: CallSiteSplittingPass on foo
+; CHECK-O2-NEXT: Running analysis: TargetLibraryAnalysis on foo
+; CHECK-O2-NEXT: Finished llvm::Function pass manager run.
; CHECK-O2-NEXT: PGOIndirectCallPromotion
; CHECK-O2-NEXT: Running analysis: ProfileSummaryAnalysis
-; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Function
; CHECK-O2-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
; CHECK-O2-NEXT: Running pass: IPSCCPPass
; CHECK-O2-NEXT: Running pass: CalledValuePropagationPass
@@ -42,7 +47,7 @@
; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy
; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}>
; CHECK-O-NEXT: Running analysis: AAManager
-; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-O1-NEXT: Running analysis: TargetLibraryAnalysis
; CHECK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass
; CHECK-O-NEXT: Running analysis: CallGraphAnalysis
; CHECK-O-NEXT: Running pass: GlobalSplitPass
diff --git a/test/Other/new-pm-thinlto-defaults.ll b/test/Other/new-pm-thinlto-defaults.ll
index 7d40ef3eea2..e83f0f87055 100644
--- a/test/Other/new-pm-thinlto-defaults.ll
+++ b/test/Other/new-pm-thinlto-defaults.ll
@@ -72,6 +72,7 @@
; CHECK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass
+; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass
; CHECK-O-NEXT: Finished llvm::Function pass manager run.
; CHECK-O-NEXT: Running pass: IPSCCPPass
; CHECK-O-NEXT: Running pass: CalledValuePropagationPass
diff --git a/test/ThinLTO/X86/deadstrip.ll b/test/ThinLTO/X86/deadstrip.ll
index c19ccb01be3..90de3bb9a32 100644
--- a/test/ThinLTO/X86/deadstrip.ll
+++ b/test/ThinLTO/X86/deadstrip.ll
@@ -18,8 +18,8 @@
; RUN: -r %t2.bc,_boo,pl \
; RUN: -r %t2.bc,_dead_func,pl \
; RUN: -r %t2.bc,_another_dead_func,pl
-; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s
-; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=CHECK2
+; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s --check-prefix=LTO2
+; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=LTO2-CHECK2
; RUN: llvm-nm %t.out.1 | FileCheck %s --check-prefix=CHECK2-NM
; RUN: llvm-bcanalyzer -dump %t.out.index.bc | FileCheck %s --check-prefix=COMBINED
@@ -27,14 +27,14 @@
; COMBINED-DAG: <COMBINED {{.*}} op2=55
; Live, Internal
; COMBINED-DAG: <COMBINED {{.*}} op2=39
-; Live, External
-; COMBINED-DAG: <COMBINED {{.*}} op2=32
-; COMBINED-DAG: <COMBINED {{.*}} op2=32
-; COMBINED-DAG: <COMBINED {{.*}} op2=32
-; (Dead)
-; COMBINED-DAG: <COMBINED {{.*}} op2=0
-; COMBINED-DAG: <COMBINED {{.*}} op2=0
-; COMBINED-DAG: <COMBINED {{.*}} op2=0
+; Live, Local, External
+; COMBINED-DAG: <COMBINED {{.*}} op2=96
+; COMBINED-DAG: <COMBINED {{.*}} op2=96
+; COMBINED-DAG: <COMBINED {{.*}} op2=96
+; Local, (Dead)
+; COMBINED-DAG: <COMBINED {{.*}} op2=64
+; COMBINED-DAG: <COMBINED {{.*}} op2=64
+; COMBINED-DAG: <COMBINED {{.*}} op2=64
; Dead-stripping on the index allows these to be internalized,
; and limits the import of @baz thanks to early pruning.
@@ -45,10 +45,18 @@
; CHECK: define internal void @bar_internal()
; CHECK: define internal void @dead_func() {
; CHECK-NOT: available_externally {{.*}} @baz()
+; LTO2-NOT: available_externally {{.*}} @baz()
+; LTO2: @llvm.global_ctors =
+; LTO2: define internal void @_GLOBAL__I_a()
+; LTO2: define internal dso_local void @bar() {
+; LTO2: define internal void @bar_internal()
+; LTO2: define internal dso_local void @dead_func() {
+; LTO2-NOT: available_externally {{.*}} @baz()
; Make sure we didn't internalize @boo, which is reachable via
; llvm.global_ctors
; CHECK2: define void @boo()
+; LTO2-CHECK2: define dso_local void @boo()
; We should have eventually removed @baz since it was internalized and unused
; CHECK2-NM-NOT: _baz
@@ -80,7 +88,7 @@
; We can't internalize @dead_func because of the use in the regular LTO
; partition.
-; CHECK-NOTDEAD: define void @dead_func()
+; CHECK-NOTDEAD: define dso_local void @dead_func()
; We also can't eliminate @baz because it is in the regular LTO partition
; and called from @dead_func.
; CHECK-NM-NOTDEAD: T _baz
diff --git a/test/ThinLTO/X86/funcimport2.ll b/test/ThinLTO/X86/funcimport2.ll
index 7338f9a9d98..86ce715f4e0 100644
--- a/test/ThinLTO/X86/funcimport2.ll
+++ b/test/ThinLTO/X86/funcimport2.ll
@@ -7,7 +7,7 @@
; RUN: -r=%t2.bc,_main,plx \
; RUN: -r=%t2.bc,_foo,l
; RUN: llvm-dis %t.o.1.3.import.bc -o - | FileCheck %s
-; CHECK: define available_externally void @foo()
+; CHECK: define available_externally dso_local void @foo()
; We shouldn't do any importing at -O0
; rm -f %t.o.1.3.import.bc
@@ -17,7 +17,7 @@
; RUN: -r=%t2.bc,_main,plx \
; RUN: -r=%t2.bc,_foo,l
; RUN: llvm-dis %t.o.1.3.import.bc -o - | FileCheck %s --check-prefix=CHECKO0
-; CHECKO0: declare void @foo(...)
+; CHECKO0: declare dso_local void @foo(...)
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
diff --git a/test/ThinLTO/X86/internalize.ll b/test/ThinLTO/X86/internalize.ll
index 867e3e5a00a..f40fbcd4b41 100644
--- a/test/ThinLTO/X86/internalize.ll
+++ b/test/ThinLTO/X86/internalize.ll
@@ -1,4 +1,4 @@
-;; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %s -o %t1.bc
; RUN: llvm-lto -thinlto-action=thinlink -o %t.index.bc %t1.bc
; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc -o - | llvm-dis -o - | FileCheck %s --check-prefix=REGULAR
; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc -o - --exported-symbol=foo | llvm-dis -o - | FileCheck %s --check-prefix=INTERNALIZE
@@ -7,7 +7,7 @@
; RUN: -r=%t1.bc,_foo,pxl \
; RUN: -r=%t1.bc,_bar,pl \
; RUN: -r=%t1.bc,_linkonce_func,pl
-; RUN: llvm-dis < %t.o.0.2.internalize.bc | FileCheck %s --check-prefix=INTERNALIZE
+; RUN: llvm-dis < %t.o.0.2.internalize.bc | FileCheck %s --check-prefix=INTERNALIZE2
; REGULAR: define void @foo
@@ -16,6 +16,9 @@
; INTERNALIZE: define void @foo
; INTERNALIZE: define internal void @bar
; INTERNALIZE: define internal void @linkonce_func()
+; INTERNALIZE2: define dso_local void @foo
+; INTERNALIZE2: define internal dso_local void @bar
+; INTERNALIZE2: define internal dso_local void @linkonce_func()
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
@@ -29,4 +32,4 @@ define void @bar() {
}
define linkonce void @linkonce_func() {
ret void
-}
\ No newline at end of file
+}
diff --git a/test/ThinLTO/X86/lazyload_metadata.ll b/test/ThinLTO/X86/lazyload_metadata.ll
index a6d46e5586a..4680e462458 100644
--- a/test/ThinLTO/X86/lazyload_metadata.ll
+++ b/test/ThinLTO/X86/lazyload_metadata.ll
@@ -10,13 +10,13 @@
; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \
; RUN: -o /dev/null -stats \
; RUN: 2>&1 | FileCheck %s -check-prefix=LAZY
-; LAZY: 53 bitcode-reader - Number of Metadata records loaded
+; LAZY: 55 bitcode-reader - Number of Metadata records loaded
; LAZY: 2 bitcode-reader - Number of MDStrings loaded
; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \
; RUN: -o /dev/null -disable-ondemand-mds-loading -stats \
; RUN: 2>&1 | FileCheck %s -check-prefix=NOTLAZY
-; NOTLAZY: 62 bitcode-reader - Number of Metadata records loaded
+; NOTLAZY: 64 bitcode-reader - Number of Metadata records loaded
; NOTLAZY: 7 bitcode-reader - Number of MDStrings loaded
diff --git a/test/ThinLTO/X86/reference_non_importable.ll b/test/ThinLTO/X86/reference_non_importable.ll
index 5cf225e95de..99b79ce198e 100644
--- a/test/ThinLTO/X86/reference_non_importable.ll
+++ b/test/ThinLTO/X86/reference_non_importable.ll
@@ -22,7 +22,7 @@ target triple = "x86_64-apple-macosx10.11.0"
; We want foo to be imported in the main module!
; RUN: llvm-dis < %t.o.1.3.import.bc | FileCheck %s --check-prefix=IMPORT
-; IMPORT: define available_externally i8** @foo()
+; IMPORT: define available_externally dso_local i8** @foo()
define i8 **@foo() {
ret i8 **@b
}
diff --git a/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
new file mode 100644
index 00000000000..d1d854d8f45
--- /dev/null
+++ b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
@@ -0,0 +1,339 @@
+; RUN: opt < %s -callsite-splitting -S | FileCheck %s
+; RUN: opt < %s -passes='function(callsite-splitting)' -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linaro-linux-gnueabi"
+
+;CHECK-LABEL: @test_eq_eq
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_eq_eq(i32* %a, i32 %v) {
+Header:
+ %tobool1 = icmp eq i32* %a, null
+ br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+ %cmp = icmp eq i32 %v, 1
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_ne_eq
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_ne_eq(i32* %a, i32 %v) {
+Header:
+ %tobool1 = icmp ne i32* %a, null
+ br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+ %cmp = icmp eq i32 %v, 1
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_ne_ne
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_ne_ne(i32* %a, i32 %v) {
+Header:
+ %tobool1 = icmp ne i32* %a, null
+ br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+ %cmp = icmp ne i32 %v, 1
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_eq_eq_untaken
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_eq_eq_untaken(i32* %a, i32 %v) {
+Header:
+ %tobool1 = icmp eq i32* %a, null
+ br i1 %tobool1, label %TBB, label %Tail
+
+TBB:
+ %cmp = icmp eq i32 %v, 1
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_ne_eq_untaken
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_ne_eq_untaken(i32* %a, i32 %v) {
+Header:
+ %tobool1 = icmp ne i32* %a, null
+ br i1 %tobool1, label %TBB, label %Tail
+
+TBB:
+ %cmp = icmp eq i32 %v, 1
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_ne_ne_untaken
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_ne_ne_untaken(i32* %a, i32 %v) {
+Header:
+ %tobool1 = icmp ne i32* %a, null
+ br i1 %tobool1, label %TBB, label %Tail
+
+TBB:
+ %cmp = icmp ne i32 %v, 1
+ br i1 %cmp, label %End, label %Tail
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_nonconst_const_phi
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_nonconst_const_phi(i32* %a, i32* %b, i32 %v) {
+Header:
+ %tobool1 = icmp eq i32* %a, %b
+ br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+ %cmp = icmp eq i32 %v, 1
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_nonconst_nonconst_phi
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 %v, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_nonconst_nonconst_phi(i32* %a, i32* %b, i32 %v, i32 %v2) {
+Header:
+ %tobool1 = icmp eq i32* %a, %b
+ br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+ %cmp = icmp eq i32 %v, %v2
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_nonconst_nonconst_phi_noncost
+;CHECK-NOT: Tail.predBB1.split:
+;CHECK-NOT: Tail.predBB2.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+;CHECK: ret i32 %r
+define i32 @test_nonconst_nonconst_phi_noncost(i32* %a, i32* %b, i32 %v, i32 %v2) {
+Header:
+ %tobool1 = icmp eq i32* %a, %b
+ br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+ %cmp = icmp eq i32 %v, %v2
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[%v,%Header], [%v2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_firstnonphi
+;CHECK-NOT: Tail.predBB1.split:
+;CHECK-NOT: Tail.predBB2.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+;CHECK: ret i32 %r
+define i32 @test_firstnonphi(i32* %a, i32 %v) {
+Header:
+ %tobool1 = icmp eq i32* %a, null
+ br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+ %cmp = icmp eq i32 %v, 1
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ store i32 %v, i32* %a
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_3preds_constphi
+;CHECK-NOT: Tail.predBB1.split:
+;CHECK-NOT: Tail.predBB2.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+;CHECK: ret i32 %r
+define i32 @test_3preds_constphi(i32* %a, i32 %v, i1 %c1, i1 %c2, i1 %c3) {
+Header:
+ br i1 %c1, label %Tail, label %TBB1
+
+TBB1:
+ br i1 %c2, label %Tail, label %TBB2
+
+TBB2:
+ br i1 %c3, label %Tail, label %End
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB1], [3, %TBB2]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+;CHECK-LABEL: @test_indirectbr_phi
+;CHECK-NOT: Tail.predBB1.split:
+;CHECK-NOT: Tail.predBB2.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+;CHECK: ret i32 %r
+define i32 @test_indirectbr_phi(i8* %address, i32* %a, i32* %b, i32 %v) {
+Header:
+ %indirect.goto.dest = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address
+ indirectbr i8* %indirect.goto.dest, [label %TBB, label %Tail]
+
+TBB:
+ %indirect.goto.dest2 = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address
+ indirectbr i8* %indirect.goto.dest2, [label %Tail, label %End]
+
+Tail:
+ %p = phi i32[1,%Header], [2, %TBB]
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+define i32 @callee(i32* %a, i32 %v, i32 %p) {
+entry:
+ %c = icmp ne i32* %a, null
+ br i1 %c, label %BB1, label %BB2
+
+BB1:
+ call void @dummy(i32* %a, i32 %p)
+ br label %End
+
+BB2:
+ call void @dummy2(i32 %v, i32 %p)
+ br label %End
+
+End:
+ ret i32 %p
+}
+
+declare void @dummy(i32*, i32)
+declare void @dummy2(i32, i32)
diff --git a/test/Transforms/CallSiteSplitting/callsite-split.ll b/test/Transforms/CallSiteSplitting/callsite-split.ll
new file mode 100644
index 00000000000..419fa738563
--- /dev/null
+++ b/test/Transforms/CallSiteSplitting/callsite-split.ll
@@ -0,0 +1,119 @@
+; RUN: opt < %s -callsite-splitting -inline -instcombine -jump-threading -S | FileCheck %s
+; RUN: opt < %s -passes='function(callsite-splitting),cgscc(inline),function(instcombine,jump-threading)' -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linaro-linux-gnueabi"
+
+%struct.bitmap = type { i32, %struct.bitmap* }
+
+;CHECK-LABEL: @caller
+;CHECK-LABEL: NextCond:
+;CHECK: br {{.*}} label %callee.exit
+;CHECK-LABEL: CallSiteBB.predBB1.split:
+;CHECK: call void @callee(%struct.bitmap* null, %struct.bitmap* null, %struct.bitmap* %b_elt, i1 false)
+;CHECK-LABEL: callee.exit:
+;CHECK: call void @dummy2(%struct.bitmap* %a_elt)
+
+define void @caller(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt) {
+entry:
+ br label %Top
+
+Top:
+ %tobool1 = icmp eq %struct.bitmap* %a_elt, null
+ br i1 %tobool1, label %CallSiteBB, label %NextCond
+
+NextCond:
+ %cmp = icmp ne %struct.bitmap* %b_elt, null
+ br i1 %cmp, label %CallSiteBB, label %End
+
+CallSiteBB:
+ %p = phi i1 [0, %Top], [%c, %NextCond]
+ call void @callee(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %p)
+ br label %End
+
+End:
+ ret void
+}
+
+define void @callee(%struct.bitmap* %dst_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %c) {
+entry:
+ %tobool = icmp ne %struct.bitmap* %a_elt, null
+ %tobool1 = icmp ne %struct.bitmap* %b_elt, null
+ %or.cond = and i1 %tobool, %tobool1
+ br i1 %or.cond, label %Cond, label %Big
+
+Cond:
+ %cmp = icmp eq %struct.bitmap* %dst_elt, %a_elt
+ br i1 %cmp, label %Small, label %Big
+
+Small:
+ call void @dummy2(%struct.bitmap* %a_elt)
+ br label %End
+
+Big:
+ call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+ call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+ call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+ call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+ call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+ call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+ call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+ br label %End
+
+End:
+ ret void
+}
+
+declare void @dummy2(%struct.bitmap*)
+declare void @dummy1(%struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*)
+
+
+;CHECK-LABEL: @caller2
+;CHECK-LABEL: CallSiteBB.predBB1.split:
+;CHECK: call void @dummy4()
+;CHECK-LABEL: CallSiteBB.predBB2.split:
+;CHECK: call void @dummy3()
+;CHECK-LABEL: CallSiteBB:
+;CHECK: %phi.call = phi i1 [ false, %CallSiteBB.predBB1.split ], [ true, %CallSiteBB.predBB2.split ]
+;CHECK: call void @foo(i1 %phi.call)
+define void @caller2(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, %struct.bitmap* %c_elt) {
+entry:
+ br label %Top
+
+Top:
+ %tobool1 = icmp eq %struct.bitmap* %a_elt, %b_elt
+ br i1 %tobool1, label %CallSiteBB, label %NextCond
+
+NextCond:
+ %cmp = icmp ne %struct.bitmap* %b_elt, %c_elt
+ br i1 %cmp, label %CallSiteBB, label %End
+
+CallSiteBB:
+ %phi = phi i1 [0, %Top],[1, %NextCond]
+ %u = call i1 @callee2(i1 %phi)
+ call void @foo(i1 %u)
+ br label %End
+
+End:
+ ret void
+}
+
+define i1 @callee2(i1 %b) {
+entry:
+ br i1 %b, label %BB1, label %BB2
+
+BB1:
+ call void @dummy3()
+ br label %End
+
+BB2:
+ call void @dummy4()
+ br label %End
+
+End:
+ ret i1 %b
+}
+
+declare void @dummy3()
+declare void @dummy4()
+declare void @foo(i1)
diff --git a/test/Transforms/CodeExtractor/PartialInlineNoInline.ll b/test/Transforms/CodeExtractor/PartialInlineNoInline.ll
new file mode 100644
index 00000000000..6c0b83298d2
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineNoInline.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -partial-inliner -S -stats -pass-remarks=partial-inlining 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -S -stats -pass-remarks=partial-inlining 2>&1 | FileCheck %s
+
+@stat = external global i32, align 4
+
+define i32 @inline_fail(i32 %count, ...) {
+entry:
+ %vargs = alloca i8*, align 8
+ %vargs1 = bitcast i8** %vargs to i8*
+ call void @llvm.va_start(i8* %vargs1)
+ %stat1 = load i32, i32* @stat, align 4
+ %cmp = icmp slt i32 %stat1, 0
+ br i1 %cmp, label %bb2, label %bb1
+
+bb1: ; preds = %entry
+ %vg1 = add nsw i32 %stat1, 1
+ store i32 %vg1, i32* @stat, align 4
+ %va1 = va_arg i8** %vargs, i32
+ call void @foo(i32 %count, i32 %va1) #2
+ br label %bb2
+
+bb2: ; preds = %bb1, %entry
+ %res = phi i32 [ 1, %bb1 ], [ 0, %entry ]
+ call void @llvm.va_end(i8* %vargs1)
+ ret i32 %res
+}
+
+define i32 @caller(i32 %arg) {
+bb:
+ %res = tail call i32 (i32, ...) @inline_fail(i32 %arg, i32 %arg)
+ ret i32 %res
+}
+
+declare void @foo(i32, i32)
+declare void @llvm.va_start(i8*)
+declare void @llvm.va_end(i8*)
+
+; Check that no remarks have been emitted, inline_fail has not been partially
+; inlined, no code has been extracted, and the partial-inlining counter
+; has not been incremented.
+
+; CHECK-NOT: remark
+; CHECK: tail call i32 (i32, ...) @inline_fail(i32 %arg, i32 %arg)
+; CHECK-NOT: inline_fail.1_bb1
+; CHECK-NOT: partial-inlining
diff --git a/test/Transforms/CodeGenPrepare/ARM/sink-addrmode.ll b/test/Transforms/CodeGenPrepare/ARM/sink-addrmode.ll
new file mode 100644
index 00000000000..06a513543c4
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/ARM/sink-addrmode.ll
@@ -0,0 +1,18 @@
+; RUN: opt -S -codegenprepare -mtriple=thumbv7m -disable-complex-addr-modes=false -addr-sink-new-select=true < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; Select between two geps with different bases and the same constant offset
+define void @test_select_twogep_base(i32* %ptr1, i32* %ptr2, i32 %value) {
+; CHECK-LABEL: @test_select_twogep_base
+; CHECK-NOT: select i1 %cmp, i32* %gep1, i32* %gep2
+; CHECK: select i1 %cmp, i32* %ptr1, i32* %ptr2
+entry:
+ %cmp = icmp sgt i32 %value, 0
+ %gep1 = getelementptr inbounds i32, i32* %ptr1, i32 1
+ %gep2 = getelementptr inbounds i32, i32* %ptr2, i32 1
+ %select = select i1 %cmp, i32* %gep1, i32* %gep2
+ store i32 %value, i32* %select, align 4
+ ret void
+}
+
diff --git a/test/Transforms/CodeGenPrepare/X86/sink-addrmode-base.ll b/test/Transforms/CodeGenPrepare/X86/sink-addrmode-base.ll
new file mode 100644
index 00000000000..2bacbdd7f40
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/X86/sink-addrmode-base.ll
@@ -0,0 +1,475 @@
+; RUN: opt -S -codegenprepare -disable-complex-addr-modes=false -addr-sink-new-phis=true -addr-sink-new-select=true %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-YES
+; RUN: opt -S -codegenprepare -disable-complex-addr-modes=false -addr-sink-new-phis=false -addr-sink-new-select=true %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NO
+target datalayout =
+"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Can we sink for a different base if there is no phi for the base?
+define i32 @test1(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test1
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO-NEXT: load
+ %c = phi i32* [%c1, %entry], [%c2, %if.then]
+ %v = load i32, i32* %c, align 4
+ ret i32 %v
+}
+
+; Can we sink for a different base if there is a phi for the base?
+define i32 @test2(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test2
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK: getelementptr i8, {{.+}} 40
+ %b = phi i64* [%b1, %entry], [%b2, %if.then]
+ %c = phi i32* [%c1, %entry], [%c2, %if.then]
+ %v = load i32, i32* %c, align 4
+ ret i32 %v
+}
+
+; Can we sink for a different base if there is a phi for the base, but not a valid one?
+define i32 @test3(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test3
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO: phi
+; CHECK-NO-NEXT: load
+ %b = phi i64* [%b2, %entry], [%b1, %if.then]
+ %c = phi i32* [%c1, %entry], [%c2, %if.then]
+ %v = load i32, i32* %c, align 4
+ ret i32 %v
+}
+
+; Can we sink for a different base if both addresses are in the same block?
+define i32 @test4(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test4
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO-NEXT: load
+ %c = phi i32* [%c1, %entry], [%c2, %if.then]
+ %v = load i32, i32* %c, align 4
+ ret i32 %v
+}
+
+; Can we sink for a different base if there is a phi for the base?
+; Both addresses are in the same block.
+define i32 @test5(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test5
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ br label %fallthrough
+
+fallthrough:
+; CHECK: getelementptr i8, {{.+}} 40
+ %b = phi i64* [%b1, %entry], [%b2, %if.then]
+ %c = phi i32* [%c1, %entry], [%c2, %if.then]
+ %v = load i32, i32* %c, align 4
+ ret i32 %v
+}
+
+; Can we sink for a different base if there is a phi for the base, but not a valid one?
+; Both addresses are in the same block.
+define i32 @test6(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test6
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: load
+ %b = phi i64* [%b2, %entry], [%b1, %if.then]
+ %c = phi i32* [%c1, %entry], [%c2, %if.then]
+ %v = load i32, i32* %c, align 4
+ ret i32 %v
+}
+
+; Case with a loop. No phi node.
+define i32 @test7(i32 %N, i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test7
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK-YES: sunk_phi
+ %iv = phi i32 [0, %entry], [%iv.inc, %fallthrough]
+ %c3 = phi i32* [%c1, %entry], [%c, %fallthrough]
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO-NEXT: load
+ %c = phi i32* [%c3, %loop], [%c2, %if.then]
+ %v = load volatile i32, i32* %c, align 4
+ %iv.inc = add i32 %iv, 1
+ %cmp = icmp slt i32 %iv.inc, %N
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i32 %v
+}
+
+; Case with a loop. There is a phi node.
+define i32 @test8(i32 %N, i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test8
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br label %loop
+
+loop:
+ %iv = phi i32 [0, %entry], [%iv.inc, %fallthrough]
+ %c3 = phi i32* [%c1, %entry], [%c, %fallthrough]
+ %b3 = phi i64* [%b1, %entry], [%b, %fallthrough]
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK: getelementptr i8, {{.+}} 40
+ %c = phi i32* [%c3, %loop], [%c2, %if.then]
+ %b = phi i64* [%b3, %loop], [%b2, %if.then]
+ %v = load volatile i32, i32* %c, align 4
+ %iv.inc = add i32 %iv, 1
+ %cmp = icmp slt i32 %iv.inc, %N
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i32 %v
+}
+
+; Case with a loop. There is a phi node, but it does not fit.
+define i32 @test9(i32 %N, i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test9
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK-YES: sunk_phi
+ %iv = phi i32 [0, %entry], [%iv.inc, %fallthrough]
+ %c3 = phi i32* [%c1, %entry], [%c, %fallthrough]
+ %b3 = phi i64* [%b1, %entry], [%b2, %fallthrough]
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: load
+ %c = phi i32* [%c3, %loop], [%c2, %if.then]
+ %b = phi i64* [%b3, %loop], [%b2, %if.then]
+ %v = load volatile i32, i32* %c, align 4
+ %iv.inc = add i32 %iv, 1
+ %cmp = icmp slt i32 %iv.inc, %N
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i32 %v
+}
+
+; Case through a loop. No phi node.
+define i32 @test10(i32 %N, i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test10
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: br
+ %c = phi i32* [%c1, %entry], [%c2, %if.then]
+ br label %loop
+
+loop:
+ %iv = phi i32 [0, %fallthrough], [%iv.inc, %loop]
+ %iv.inc = add i32 %iv, 1
+ %cmp = icmp slt i32 %iv.inc, %N
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+; CHECK-YES: sunkaddr
+ %v = load volatile i32, i32* %c, align 4
+ ret i32 %v
+}
+
+; Case through a loop. There is a phi.
+define i32 @test11(i32 %N, i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test11
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK: phi
+; CHECK: phi
+; CHECK: br
+ %c = phi i32* [%c1, %entry], [%c2, %if.then]
+ %b = phi i64* [%b1, %entry], [%b2, %if.then]
+ br label %loop
+
+loop:
+ %iv = phi i32 [0, %fallthrough], [%iv.inc, %loop]
+ %iv.inc = add i32 %iv, 1
+ %cmp = icmp slt i32 %iv.inc, %N
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+; CHECK: sunkaddr
+ %v = load volatile i32, i32* %c, align 4
+ ret i32 %v
+}
+
+; Complex case with an address value from the previous iteration.
+define i32 @test12(i32 %N, i1 %cond, i64* %b1, i64* %b2, i64* %b3) {
+; CHECK-LABEL: @test12
+entry:
+ %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %c1 = bitcast i64* %a1 to i32*
+ br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK-YES: sunk_phi
+; CHECK-NO: phi
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: br
+ %iv = phi i32 [0, %entry], [%iv.inc, %backedge]
+ %c3 = phi i32* [%c1, %entry], [%c, %backedge]
+ %b4 = phi i64* [%b1, %entry], [%b5, %backedge]
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %c2 = bitcast i64* %a2 to i32*
+ br label %fallthrough
+
+fallthrough:
+; CHECK-LABEL: fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO: phi
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: load
+ %c = phi i32* [%c3, %loop], [%c2, %if.then]
+ %b6 = phi i64* [%b4, %loop], [%b2, %if.then]
+ %v = load volatile i32, i32* %c, align 4
+ %a4 = getelementptr inbounds i64, i64* %b4, i64 5
+ %c4 = bitcast i64* %a4 to i32*
+ %cmp = icmp slt i32 %iv, 20
+ br i1 %cmp, label %backedge, label %if.then.2
+
+if.then.2:
+ br label %backedge
+
+backedge:
+ %b5 = phi i64* [%b4, %fallthrough], [%b6, %if.then.2]
+ %iv.inc = add i32 %iv, 1
+ %cmp2 = icmp slt i32 %iv.inc, %N
+ br i1 %cmp2, label %loop, label %exit
+
+exit:
+ ret i32 %v
+}
+
+%struct.S = type {i32, i32}
+; Case with an index.
+define i32 @test13(i1 %cond, %struct.S* %b1, %struct.S* %b2, i64 %Index) {
+; CHECK-LABEL: @test13
+entry:
+ %a1 = getelementptr inbounds %struct.S, %struct.S* %b1, i64 %Index, i32 1
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %i2 = mul i64 %Index, 2
+ %a2 = getelementptr inbounds %struct.S, %struct.S* %b2, i64 %Index, i32 1
+ br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: load
+ %a = phi i32* [%a1, %entry], [%a2, %if.then]
+ %v = load i32, i32* %a, align 4
+ ret i32 %v
+}
+
+; Select of Select case.
+define i64 @test14(i1 %c1, i1 %c2, i64* %b1, i64* %b2, i64* %b3) {
+; CHECK-LABEL: @test14
+entry:
+; CHECK-LABEL: entry:
+ %g1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %g2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %g3 = getelementptr inbounds i64, i64* %b3, i64 5
+ %s1 = select i1 %c1, i64* %g1, i64* %g2
+ %s2 = select i1 %c2, i64* %s1, i64* %g3
+; CHECK: sunkaddr
+ %v = load i64 , i64* %s2, align 8
+ ret i64 %v
+}
+
+; Select of Phi case.
+define i64 @test15(i1 %c1, i1 %c2, i64* %b1, i64* %b2, i64* %b3) {
+; CHECK-LABEL: @test15
+entry:
+ %g1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %g2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %g3 = getelementptr inbounds i64, i64* %b3, i64 5
+ br i1 %c1, label %if.then, label %fallthrough
+
+if.then:
+ br label %fallthrough
+
+fallthrough:
+; CHECK-LABEL: fallthrough:
+ %p1 = phi i64* [%g1, %entry], [%g2, %if.then]
+ %s1 = select i1 %c2, i64* %p1, i64* %g3
+; CHECK-YES: sunkaddr
+; CHECK-NO: phi
+; CHECK-NO-NEXT: select
+; CHECK-NO-NEXT: load
+ %v = load i64 , i64* %s1, align 8
+ ret i64 %v
+}
+
+; Select of Phi case. The phi already exists.
+define i64 @test16(i1 %c1, i1 %c2, i64* %b1, i64* %b2, i64* %b3) {
+; CHECK-LABEL: @test16
+entry:
+ %g1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %g2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %g3 = getelementptr inbounds i64, i64* %b3, i64 5
+ br i1 %c1, label %if.then, label %fallthrough
+
+if.then:
+ br label %fallthrough
+
+fallthrough:
+; CHECK-LABEL: fallthrough:
+ %p = phi i64* [%b1, %entry], [%b2, %if.then]
+ %p1 = phi i64* [%g1, %entry], [%g2, %if.then]
+ %s1 = select i1 %c2, i64* %p1, i64* %g3
+; CHECK: sunkaddr
+ %v = load i64 , i64* %s1, align 8
+ ret i64 %v
+}
+
+; Phi of Select case.
+define i64 @test17(i1 %c1, i1 %c2, i64* %b1, i64* %b2, i64* %b3) {
+; CHECK-LABEL: @test17
+entry:
+ %g1 = getelementptr inbounds i64, i64* %b1, i64 5
+ %g2 = getelementptr inbounds i64, i64* %b2, i64 5
+ %g3 = getelementptr inbounds i64, i64* %b3, i64 5
+ %s1 = select i1 %c2, i64* %g1, i64* %g2
+ br i1 %c1, label %if.then, label %fallthrough
+
+if.then:
+ br label %fallthrough
+
+fallthrough:
+; CHECK-LABEL: fallthrough:
+ %p1 = phi i64* [%s1, %entry], [%g3, %if.then]
+; CHECK-YES: sunkaddr
+; CHECK-NO: phi
+; CHECK-NO-NEXT: load
+ %v = load i64 , i64* %p1, align 8
+ ret i64 %v
+}
diff --git a/test/LibDriver/lit.local.cfg b/test/Transforms/ExpandMemCmp/X86/lit.local.cfg
index e71f3cc4c41..e71f3cc4c41 100644
--- a/test/LibDriver/lit.local.cfg
+++ b/test/Transforms/ExpandMemCmp/X86/lit.local.cfg
diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/ExpandMemCmp/X86/memcmp.ll
index a4f635c956d..1abfb20f369 100644
--- a/test/Transforms/CodeGenPrepare/X86/memcmp.ll
+++ b/test/Transforms/ExpandMemCmp/X86/memcmp.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -codegenprepare -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32
-; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: opt -S -expandmemcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: opt -S -expandmemcmp -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64
declare i32 @memcmp(i8* nocapture, i8* nocapture, i64)
@@ -23,30 +23,33 @@ define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp3(
-; ALL-NEXT: loadbb:
-; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16*
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16*
-; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]]
-; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
-; ALL-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
-; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; ALL-NEXT: [[TMP6:%.*]] = icmp eq i16 [[TMP4]], [[TMP5]]
-; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; ALL-NEXT: br label [[LOADBB:%.*]]
; ALL: res_block:
-; ALL-NEXT: [[TMP7:%.*]] = icmp ult i16 [[TMP4]], [[TMP5]]
-; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i16 [ [[TMP7:%.*]], [[LOADBB]] ]
+; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i16 [ [[TMP8:%.*]], [[LOADBB]] ]
+; ALL-NEXT: [[TMP1:%.*]] = icmp ult i16 [[PHI_SRC1]], [[PHI_SRC2]]
+; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb:
+; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i16*
+; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i16*
+; ALL-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]]
+; ALL-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP4]]
+; ALL-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
+; ALL-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
+; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
; ALL: loadbb1:
-; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 2
-; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 2
-; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]]
+; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 2
+; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 2
; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]]
-; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]]
; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
+; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; ALL-NEXT: br label [[ENDBLOCK]]
; ALL: endblock:
-; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; ALL-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3)
@@ -74,30 +77,33 @@ define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp5(
-; ALL-NEXT: loadbb:
-; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
-; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
-; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
-; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; ALL-NEXT: br label [[LOADBB:%.*]]
; ALL: res_block:
-; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]]
-; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ]
+; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ]
+; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb:
+; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]]
+; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]]
+; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
+; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
+; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
; ALL: loadbb1:
-; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 4
-; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 4
-; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]]
+; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 4
+; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 4
; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]]
-; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]]
; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
+; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; ALL-NEXT: br label [[ENDBLOCK]]
; ALL: endblock:
-; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; ALL-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5)
@@ -106,36 +112,37 @@ define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp6(
-; ALL-NEXT: loadbb:
-; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
-; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
-; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
-; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; ALL-NEXT: br label [[LOADBB:%.*]]
; ALL: res_block:
-; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
-; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ]
-; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
-; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
+; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb:
+; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]]
+; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]]
+; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
+; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
+; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
; ALL: loadbb1:
-; ALL-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16*
-; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16*
-; ALL-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2
+; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16*
+; ALL-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16*
; ALL-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2
-; ALL-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]]
+; ALL-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2
; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]]
-; ALL-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]])
+; ALL-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]]
; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; ALL-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32
+; ALL-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
; ALL-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32
-; ALL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]]
-; ALL-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; ALL-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i32
+; ALL-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]]
+; ALL-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
; ALL: endblock:
-; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; ALL-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6)
@@ -153,34 +160,35 @@ define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp8(
-; X32-NEXT: loadbb:
-; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
-; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
-; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
-; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X32-NEXT: br label [[LOADBB:%.*]]
; X32: res_block:
-; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
-; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ]
-; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
-; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
+; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X32-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; X32-NEXT: br label [[ENDBLOCK:%.*]]
+; X32: loadbb:
+; X32-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; X32-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; X32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]]
+; X32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]]
+; X32-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
+; X32-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
+; X32-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
+; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
; X32: loadbb1:
-; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32*
-; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32*
-; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1
+; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32*
+; X32-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32*
; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1
-; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]]
+; X32-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]]
-; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]])
+; X32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]]
; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]]
-; X32-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X32-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
+; X32-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]]
+; X32-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
; X32: endblock:
-; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; X32-NEXT: ret i32 [[PHI_RES]]
;
; X64-LABEL: @cmp8(
@@ -207,30 +215,33 @@ define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: ret i32 [[CALL]]
;
; X64-LABEL: @cmp9(
-; X64-NEXT: loadbb:
-; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
-; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
-; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT: br label [[LOADBB:%.*]]
; X64: res_block:
-; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]]
-; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ]
+; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ]
+; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb:
+; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]]
+; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]]
+; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
+; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
+; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
+; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
; X64: loadbb1:
-; X64-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 8
-; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 8
-; X64-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]]
+; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8
+; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 8
; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]]
-; X64-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]]
; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X64-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
+; X64-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; X64-NEXT: br label [[ENDBLOCK]]
; X64: endblock:
-; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; X64-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9)
@@ -243,36 +254,37 @@ define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: ret i32 [[CALL]]
;
; X64-LABEL: @cmp10(
-; X64-NEXT: loadbb:
-; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
-; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
-; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT: br label [[LOADBB:%.*]]
; X64: res_block:
-; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
-; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ]
-; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
-; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb:
+; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]]
+; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]]
+; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
+; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
+; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
+; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
; X64: loadbb1:
-; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16*
-; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16*
-; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4
+; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16*
+; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16*
; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4
-; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]]
+; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 4
; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]]
-; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]])
+; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]]
; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64
+; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64
-; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]]
-; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64
+; X64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]]
+; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
; X64: endblock:
-; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; X64-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10)
@@ -294,36 +306,37 @@ define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: ret i32 [[CALL]]
;
; X64-LABEL: @cmp12(
-; X64-NEXT: loadbb:
-; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
-; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
-; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT: br label [[LOADBB:%.*]]
; X64: res_block:
-; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
-; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ]
-; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
-; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb:
+; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]]
+; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]]
+; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
+; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
+; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
+; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
; X64: loadbb1:
-; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32*
-; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32*
-; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2
+; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32*
+; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32*
; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2
-; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]]
+; X64-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 2
; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]]
-; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]])
+; X64-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]]
; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64
+; X64-NEXT: [[TMP17:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64
-; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]]
-; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT: [[TMP19]] = zext i32 [[TMP17]] to i64
+; X64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]]
+; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
; X64: endblock:
-; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; X64-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12)
@@ -363,34 +376,35 @@ define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: ret i32 [[CALL]]
;
; X64-LABEL: @cmp16(
-; X64-NEXT: loadbb:
-; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
-; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
-; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT: br label [[LOADBB:%.*]]
; X64: res_block:
-; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
-; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ]
-; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
-; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb:
+; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]]
+; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]]
+; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
+; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
+; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
+; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
; X64: loadbb1:
-; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i64*
-; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i64*
-; X64-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 1
+; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i64*
+; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i64*
; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1
-; X64-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP11]]
+; X64-NEXT: [[TMP13:%.*]] = getelementptr i64, i64* [[TMP11]], i64 1
; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]]
-; X64-NEXT: [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]])
+; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]]
; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP15]], [[TMP16]]
-; X64-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT: [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
+; X64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
+; X64-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
; X64: endblock:
-; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; X64-NEXT: ret i32 [[PHI_RES]]
;
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)
@@ -417,22 +431,23 @@ define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq3(
-; ALL-NEXT: loadbb:
-; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16*
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16*
-; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]]
-; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
-; ALL-NEXT: [[TMP4:%.*]] = icmp ne i16 [[TMP2]], [[TMP3]]
-; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; ALL-NEXT: br label [[LOADBB:%.*]]
; ALL: res_block:
; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb:
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16*
+; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]]
+; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
+; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; ALL: loadbb1:
-; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 2
-; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 2
-; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]]
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 2
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 2
; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]]
-; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]]
-; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]]
+; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
+; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; ALL: endblock:
; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -465,22 +480,23 @@ define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq5(
-; ALL-NEXT: loadbb:
-; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]]
-; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; ALL-NEXT: br label [[LOADBB:%.*]]
; ALL: res_block:
; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb:
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
+; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
+; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; ALL: loadbb1:
-; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 4
-; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 4
-; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]]
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 4
; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]]
-; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]]
-; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]]
+; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
+; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; ALL: endblock:
; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -495,24 +511,25 @@ define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq6(
-; ALL-NEXT: loadbb:
-; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]]
-; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; ALL-NEXT: br label [[LOADBB:%.*]]
; ALL: res_block:
; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb:
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
+; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
+; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; ALL: loadbb1:
-; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16*
-; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16*
-; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2
+; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16*
+; ALL-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16*
; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2
-; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]]
+; ALL-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2
; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
-; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]]
-; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; ALL-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]]
+; ALL-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
+; ALL-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; ALL: endblock:
; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -540,24 +557,25 @@ define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp_eq8(
-; X32-NEXT: loadbb:
-; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]]
-; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X32-NEXT: br label [[LOADBB:%.*]]
; X32: res_block:
; X32-NEXT: br label [[ENDBLOCK:%.*]]
+; X32: loadbb:
+; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
+; X32-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
+; X32-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X32: loadbb1:
-; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32*
-; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32*
-; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1
+; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32*
+; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32*
; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1
-; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]]
+; X32-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 1
; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
-; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]]
-; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
+; X32-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
+; X32-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; X32: endblock:
; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -589,22 +607,23 @@ define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: ret i32 [[CONV]]
;
; X64-LABEL: @cmp_eq9(
-; X64-NEXT: loadbb:
-; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
-; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64-NEXT: br label [[LOADBB:%.*]]
; X64: res_block:
; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb:
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
+; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
+; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X64: loadbb1:
-; X64-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 8
-; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 8
-; X64-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]]
+; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8
+; X64-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 8
; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]]
-; X64-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]]
-; X64-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]]
+; X64-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
+; X64-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; X64: endblock:
; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -625,24 +644,25 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: ret i32 [[CONV]]
;
; X64-LABEL: @cmp_eq10(
-; X64-NEXT: loadbb:
-; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
-; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64-NEXT: br label [[LOADBB:%.*]]
; X64: res_block:
; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb:
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
+; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
+; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X64: loadbb1:
-; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16*
-; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16*
-; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4
+; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16*
+; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16*
; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4
-; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]]
+; X64-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4
; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
-; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]]
-; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]]
+; X64-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
+; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; X64: endblock:
; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -676,24 +696,25 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-NEXT: ret i32 [[CONV]]
;
; X64-LABEL: @cmp_eq12(
-; X64-NEXT: loadbb:
-; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
-; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64-NEXT: br label [[LOADBB:%.*]]
; X64: res_block:
; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb:
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
+; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
+; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
; X64: loadbb1:
-; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32*
-; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32*
-; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2
+; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32*
+; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32*
; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2
-; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]]
+; X64-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2
; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
-; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]]
-; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
+; X64-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
+; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
; X64: endblock:
; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
diff --git a/test/Transforms/IRCE/add-metadata-pre-post-loops.ll b/test/Transforms/IRCE/add-metadata-pre-post-loops.ll
index 488d4b479ba..0225af903ef 100644
--- a/test/Transforms/IRCE/add-metadata-pre-post-loops.ll
+++ b/test/Transforms/IRCE/add-metadata-pre-post-loops.ll
@@ -38,7 +38,7 @@ exit: ; preds = %in.bounds, %entry
define void @single_access_with_preloop(i32 *%arr, i32 *%a_len_ptr, i32 %n, i32 %offset) {
; CHECK-LABEL: @single_access_with_preloop(
; CHECK-LABEL: in.bounds.preloop
-; CHECK: br i1 %14, label %loop.preloop, label %preloop.exit.selector, !llvm.loop !8, !irce.loop.clone !7
+; CHECK: br i1 [[COND:%[^ ]+]], label %loop.preloop, label %preloop.exit.selector, !llvm.loop !8, !irce.loop.clone !7
; CHECK-LABEL: in.bounds.postloop
; CHECK: br i1 %next.postloop, label %loop.postloop, label %exit.loopexit.loopexit, !llvm.loop !9, !irce.loop.clone !7
entry:
diff --git a/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll b/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll
new file mode 100644
index 00000000000..dc6aae8d8aa
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll
@@ -0,0 +1,71 @@
+; RUN: opt %s -indvars -S -o - | FileCheck %s
+source_filename = "/Data/llvm/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.status = type { i32, i8* }
+
+@status = internal unnamed_addr global [32 x %struct.status] zeroinitializer, align 16, !dbg !0
+
+define void @f0() local_unnamed_addr !dbg !20 {
+entry:
+ tail call void @llvm.dbg.value(metadata i32 0, metadata !23, metadata !DIExpression()), !dbg !24
+ br label %for.cond, !dbg !24
+
+for.cond: ; preds = %for.body, %entry
+ ; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ ; CHECK: call void @llvm.dbg.value(metadata i64 %indvars.iv, metadata !23, metadata !DIExpression()), !dbg !24
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ tail call void @llvm.dbg.value(metadata i32 %i.0, metadata !23, metadata !DIExpression()), !dbg !24
+ %cmp = icmp slt i32 %i.0, 32, !dbg !24
+ br i1 %cmp, label %for.body, label %for.end, !dbg !24
+
+for.body: ; preds = %for.cond
+ %idxprom = sext i32 %i.0 to i64, !dbg !24
+ %value = getelementptr inbounds [32 x %struct.status], [32 x %struct.status]* @status, i64 0, i64 %idxprom, i32 0, !dbg !24
+ store i32 42, i32* %value, align 16, !dbg !24
+ tail call void @use(i32 %i.0), !dbg !24
+ %inc = add nsw i32 %i.0, 1, !dbg !24
+ tail call void @llvm.dbg.value(metadata i32 %inc, metadata !23, metadata !DIExpression()), !dbg !24
+ br label %for.cond, !dbg !24
+
+for.end: ; preds = %for.cond
+ ret void, !dbg !24
+}
+
+declare void @use(i32)
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #0
+
+attributes #0 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!16, !17, !18}
+!llvm.ident = !{!19}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "status", scope: !2, file: !3, line: 5, type: !6, isLocal: true, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 6.0.0 (trunk 316001) (llvm/trunk 316171)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "x.c", directory: "/home/davide/work/llvm/build-release/bin")
+!4 = !{}
+!5 = !{!0}
+!6 = !DICompositeType(tag: DW_TAG_array_type, baseType: !7, size: 4096, elements: !14)
+!7 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "status", file: !3, line: 2, size: 128, elements: !8)
+!8 = !{!9, !11}
+!9 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !7, file: !3, line: 3, baseType: !10, size: 32)
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !DIDerivedType(tag: DW_TAG_member, name: "p", scope: !7, file: !3, line: 4, baseType: !12, size: 64, offset: 64)
+!12 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !13, size: 64)
+!13 = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char)
+!14 = !{!15}
+!15 = !DISubrange(count: 32)
+!16 = !{i32 2, !"Dwarf Version", i32 4}
+!17 = !{i32 2, !"Debug Info Version", i32 3}
+!18 = !{i32 1, !"wchar_size", i32 4}
+!19 = !{!"clang version 6.0.0 (trunk 316001) (llvm/trunk 316171)"}
+!20 = distinct !DISubprogram(name: "f0", scope: !3, file: !3, line: 6, type: !21, isLocal: false, isDefinition: true, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: true, unit: !2, variables: !22)
+!21 = !DISubroutineType(types: !4)
+!22 = !{!23}
+!23 = !DILocalVariable(name: "i", scope: !20, file: !3, line: 8, type: !10)
+!24 = !DILocation(line: 9, scope: !20)
diff --git a/test/Transforms/InstCombine/debuginfo_add.ll b/test/Transforms/InstCombine/debuginfo_add.ll
new file mode 100644
index 00000000000..0d194cc65c7
--- /dev/null
+++ b/test/Transforms/InstCombine/debuginfo_add.ll
@@ -0,0 +1,108 @@
+; RUN: opt -instcombine %s -o - -S | FileCheck %s
+; typedef struct v *v_t;
+; struct v {
+; unsigned long long p;
+; };
+;
+; void f(v_t object, unsigned long long *start) {
+; unsigned head_size;
+; unsigned long long orig_start;
+; unsigned long long offset;
+; orig_start = *start;
+; for (offset = orig_start - (unsigned long long)(1 << 12); head_size;
+; offset -= (unsigned long long)(1 << 12), head_size -= (1 << 12))
+; use(offset, (object));
+; }
+source_filename = "test.i"
+target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7s-apple-ios5.0.0"
+
+%struct.vm_object = type { i64 }
+
+; Function Attrs: nounwind ssp
+define void @f(%struct.vm_object* %object, i64* nocapture readonly %start) local_unnamed_addr #0 !dbg !11 {
+entry:
+ tail call void @llvm.dbg.value(metadata %struct.vm_object* %object, metadata !21, metadata !DIExpression()), !dbg !27
+ tail call void @llvm.dbg.value(metadata i64* %start, metadata !22, metadata !DIExpression()), !dbg !28
+ %0 = load i64, i64* %start, align 4, !dbg !29
+ tail call void @llvm.dbg.value(metadata i64 %0, metadata !25, metadata !DIExpression()), !dbg !30
+ %offset.08 = add i64 %0, -4096
+ tail call void @llvm.dbg.value(metadata i64 %offset.08, metadata !26, metadata !DIExpression()), !dbg !31
+ ; CHECK: call void @llvm.dbg.value(metadata i64 %0, metadata !26, metadata !DIExpression(DW_OP_constu, 4096, DW_OP_minus, DW_OP_stack_value)), !dbg !30
+ tail call void @llvm.dbg.value(metadata i32 undef, metadata !23, metadata !DIExpression()), !dbg !32
+ br i1 undef, label %for.end, label %for.body.lr.ph, !dbg !32
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body, !dbg !32
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %offset.010 = phi i64 [ %offset.08, %for.body.lr.ph ], [ %offset.0, %for.body ]
+ %head_size.09 = phi i32 [ undef, %for.body.lr.ph ], [ %sub2, %for.body ]
+ tail call void @llvm.dbg.value(metadata i32 %head_size.09, metadata !23, metadata !DIExpression()), !dbg !31
+ %call = tail call i32 bitcast (i32 (...)* @use to i32 (i64, %struct.vm_object*)*)(i64 %offset.010, %struct.vm_object* %object) #3, !dbg !34
+ %sub2 = add i32 %head_size.09, -4096, !dbg !37
+ %offset.0 = add i64 %offset.010, -4096
+ tail call void @llvm.dbg.value(metadata i64 %offset.0, metadata !26, metadata !DIExpression()), !dbg !30
+ ; CHECK: call void @llvm.dbg.value(metadata i64 %offset.010, metadata !26, metadata !DIExpression(DW_OP_constu, 4096, DW_OP_minus, DW_OP_stack_value)), !dbg !29
+ tail call void @llvm.dbg.value(metadata i32 %sub2, metadata !23, metadata !DIExpression()), !dbg !31
+ %tobool = icmp eq i32 %sub2, 0, !dbg !32
+ br i1 %tobool, label %for.end, label %for.body, !dbg !32, !llvm.loop !38
+
+for.end: ; preds = %for.body, %entry
+ ret void, !dbg !40
+}
+
+declare i32 @use(...) local_unnamed_addr
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #2
+
+attributes #0 = { nounwind ssp }
+attributes #2 = { nounwind readnone speculatable }
+attributes #3 = { nobuiltin }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!5, !6, !7, !8, !9}
+!llvm.ident = !{!10}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0 (trunk 317434) (llvm/trunk 317437)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3)
+!1 = !DIFile(filename: "test.i", directory: "/Data/radar/31209283")
+!2 = !{}
+!3 = !{!4}
+!4 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!5 = !{i32 2, !"Dwarf Version", i32 2}
+!6 = !{i32 2, !"Debug Info Version", i32 3}
+!7 = !{i32 1, !"wchar_size", i32 4}
+!8 = !{i32 1, !"min_enum_size", i32 4}
+!9 = !{i32 7, !"PIC Level", i32 2}
+!10 = !{!"clang version 6.0.0 (trunk 317434) (llvm/trunk 317437)"}
+!11 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 6, type: !12, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !20)
+!12 = !DISubroutineType(types: !13)
+!13 = !{null, !14, !19}
+!14 = !DIDerivedType(tag: DW_TAG_typedef, name: "v_t", file: !1, line: 1, baseType: !15)
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 32)
+!16 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "v", file: !1, line: 2, size: 64, elements: !17)
+!17 = !{!18}
+!18 = !DIDerivedType(tag: DW_TAG_member, name: "p", scope: !16, file: !1, line: 3, baseType: !4, size: 64)
+!19 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 32)
+!20 = !{!21, !22, !23, !25, !26}
+!21 = !DILocalVariable(name: "object", arg: 1, scope: !11, file: !1, line: 6, type: !14)
+!22 = !DILocalVariable(name: "start", arg: 2, scope: !11, file: !1, line: 6, type: !19)
+!23 = !DILocalVariable(name: "head_size", scope: !11, file: !1, line: 7, type: !24)
+!24 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!25 = !DILocalVariable(name: "orig_start", scope: !11, file: !1, line: 8, type: !4)
+!26 = !DILocalVariable(name: "offset", scope: !11, file: !1, line: 9, type: !4)
+!27 = !DILocation(line: 6, column: 20, scope: !11)
+!28 = !DILocation(line: 6, column: 48, scope: !11)
+!29 = !DILocation(line: 8, column: 22, scope: !11)
+!30 = !DILocation(line: 7, column: 12, scope: !11)
+!31 = !DILocation(line: 10, column: 16, scope: !11)
+!32 = !DILocation(line: 11, column: 5, scope: !33)
+!33 = distinct !DILexicalBlock(scope: !11, file: !1, line: 11, column: 5)
+!34 = !DILocation(line: 13, column: 7, scope: !35)
+!35 = distinct !DILexicalBlock(scope: !36, file: !1, line: 12, column: 75)
+!36 = distinct !DILexicalBlock(scope: !33, file: !1, line: 11, column: 5)
+!37 = !DILocation(line: 12, column: 61, scope: !36)
+!38 = distinct !{!38, !32, !39}
+!39 = !DILocation(line: 14, column: 3, scope: !33)
+!40 = !DILocation(line: 15, column: 1, scope: !11)
diff --git a/test/Transforms/InstCombine/shift.ll b/test/Transforms/InstCombine/shift.ll
index cbb3d614db2..ba52023e0db 100644
--- a/test/Transforms/InstCombine/shift.ll
+++ b/test/Transforms/InstCombine/shift.ll
@@ -1332,3 +1332,263 @@ define i7 @test65(i7 %a, i7 %b) {
%y = and i7 %x, 1 ; this extracts the lsb which should be 0 because we shifted an even number of bits and all even bits of the shift input are 0.
ret i7 %y
}
+
+define i32 @shl_select_add_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_add_true(
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = add i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = shl i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @shl_select_add_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_add_false(
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = add i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = shl i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @shl_select_and_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_and_true(
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = and i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = shl i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @shl_select_and_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_and_false(
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = and i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = shl i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @lshr_select_and_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_and_true(
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = and i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = lshr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @lshr_select_and_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_and_false(
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = and i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = lshr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @ashr_select_and_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_and_true(
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], -1073741821
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = and i32 %x, 2147483655
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = ashr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @ashr_select_and_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_and_false(
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], -1073741821
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = and i32 %x, 2147483655
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = ashr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @shl_select_or_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_or_true(
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = or i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = shl i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @shl_select_or_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_or_false(
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = or i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = shl i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @lshr_select_or_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_or_true(
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = or i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = lshr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @lshr_select_or_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_or_false(
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = or i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = lshr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @ashr_select_or_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_or_true(
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = or i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = ashr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @ashr_select_or_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_or_false(
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = or i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = ashr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @shl_select_xor_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_xor_true(
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = xor i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = shl i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @shl_select_xor_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_xor_false(
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = xor i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = shl i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @lshr_select_xor_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_xor_true(
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = xor i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = lshr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @lshr_select_xor_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_xor_false(
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = xor i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = lshr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @ashr_select_xor_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_xor_true(
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = xor i32 %x, 7
+ %2 = select i1 %cond, i32 %1, i32 %x
+ %3 = ashr i32 %2, 1
+ ret i32 %3
+}
+
+define i32 @ashr_select_xor_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_xor_false(
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+ %1 = xor i32 %x, 7
+ %2 = select i1 %cond, i32 %x, i32 %1
+ %3 = ashr i32 %2, 1
+ ret i32 %3
+}
diff --git a/test/Transforms/LICM/sinking.ll b/test/Transforms/LICM/sinking.ll
index 6e9e8d4b7b6..b28eea0bc2a 100644
--- a/test/Transforms/LICM/sinking.ll
+++ b/test/Transforms/LICM/sinking.ll
@@ -392,6 +392,288 @@ lab60:
indirectbr i8* undef, [label %lab21, label %lab19]
}
-declare void @f(i32*)
+; Check that LICM can sink a sinkable instruction to the exit blocks through
+; a non-trivially replaceable PHI node.
+;
+; CHECK-LABEL: @test14
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+; CHECK-NOT: sub
+;
+; CHECK-LABEL: Out12.split.loop.exit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop ]
+; CHECK: %[[MUL:.*]] = mul i32 %N, %[[LCSSAPHI]]
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12.split.loop.exit1:
+; CHECK: %[[LCSSAPHI2:.*]] = phi i32 [ %N_addr.0.pn, %Loop ]
+; CHECK: %[[MUL2:.*]] = mul i32 %N, %[[LCSSAPHI2]]
+; CHECK: %[[SUB:.*]] = sub i32 %[[MUL2]], %N
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12:
+; CHECK: phi i32 [ %[[MUL]], %Out12.split.loop.exit ], [ %[[SUB]], %Out12.split.loop.exit1 ]
+define i32 @test14(i32 %N, i32 %N2, i1 %C) {
+Entry:
+ br label %Loop
+Loop:
+ %N_addr.0.pn = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ]
+ %sink.mul = mul i32 %N, %N_addr.0.pn
+ %sink.sub = sub i32 %sink.mul, %N
+ %dec = add i32 %N_addr.0.pn, -1
+ br i1 %C, label %ContLoop, label %Out12
+ContLoop:
+ %tmp.1 = icmp ne i32 %N_addr.0.pn, 1
+ br i1 %tmp.1, label %Loop, label %Out12
+Out12:
+ %tmp = phi i32 [%sink.mul, %ContLoop], [%sink.sub, %Loop]
+ ret i32 %tmp
+}
+
+; In this test, splitting predecessors is not strictly required because the
+; sinkable instructions (sub and mul) perform the same operations. In that case
+; we could sink a single copy of the operations and modify the PHI to pass the
+; operands to the shared instructions. As of now, LICM splits the predecessors
+; of non-trivially replaceable PHIs by default, because all incoming edges of a
+; non-trivially replaceable PHI in LCSSA form are critical.
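+; Note: a PHI is trivially replaceable by a sinkable instruction only when every
+; incoming value is that instruction itself, e.g. a sketch (not checked below):
+;   %p = phi i32 [ %sink.sub, %Loop ], [ %sink.sub, %ContLoop ]
+; Here the incoming values differ (%sink.sub vs. %sink.sub2), so the exit block
+; predecessors must be split before the instructions can be sunk.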
+;
+; CHECK-LABEL: @test15
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+; CHECK-NOT: sub
+;
+; CHECK-LABEL: Out12.split.loop.exit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop ]
+; CHECK: %[[MUL:.*]] = mul i32 %N, %[[LCSSAPHI]]
+; CHECK: %[[SUB:.*]] = sub i32 %[[MUL]], %N2
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12.split.loop.exit1:
+; CHECK: %[[LCSSAPHI2:.*]] = phi i32 [ %N_addr.0.pn, %Loop ]
+; CHECK: %[[MUL2:.*]] = mul i32 %N, %[[LCSSAPHI2]]
+; CHECK: %[[SUB2:.*]] = sub i32 %[[MUL2]], %N
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12:
+; CHECK: phi i32 [ %[[SUB]], %Out12.split.loop.exit ], [ %[[SUB2]], %Out12.split.loop.exit1 ]
+define i32 @test15(i32 %N, i32 %N2, i1 %C) {
+Entry:
+ br label %Loop
+Loop:
+ %N_addr.0.pn = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ]
+ %sink.mul = mul i32 %N, %N_addr.0.pn
+ %sink.sub = sub i32 %sink.mul, %N
+ %sink.sub2 = sub i32 %sink.mul, %N2
+ %dec = add i32 %N_addr.0.pn, -1
+ br i1 %C, label %ContLoop, label %Out12
+ContLoop:
+ %tmp.1 = icmp ne i32 %N_addr.0.pn, 1
+ br i1 %tmp.1, label %Loop, label %Out12
+Out12:
+ %tmp = phi i32 [%sink.sub2, %ContLoop], [%sink.sub, %Loop]
+ ret i32 %tmp
+}
+
+; Sink through a non-trivially replaceable PHI node which uses the same sinkable
+; instruction multiple times.
+;
+; CHECK-LABEL: @test16
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+;
+; CHECK-LABEL: Out.split.loop.exit:
+; CHECK: %[[PHI:.*]] = phi i32 [ %l2, %ContLoop ]
+; CHECK: br label %Out
+;
+; CHECK-LABEL: Out.split.loop.exit1:
+; CHECK: %[[SINKABLE:.*]] = mul i32 %l2.lcssa, %t.le
+; CHECK: br label %Out
+;
+; CHECK-LABEL: Out:
+; CHECK: %idx = phi i32 [ %[[PHI]], %Out.split.loop.exit ], [ %[[SINKABLE]], %Out.split.loop.exit1 ]
+define i32 @test16(i1 %c, i8** %P, i32* %P2, i64 %V) {
+entry:
+ br label %loop.ph
+loop.ph:
+ br label %Loop
+Loop:
+ %iv = phi i64 [ 0, %loop.ph ], [ %next, %ContLoop ]
+ %l2 = call i32 @getv()
+ %t = trunc i64 %iv to i32
+ %sinkable = mul i32 %l2, %t
+ switch i32 %l2, label %ContLoop [
+ i32 32, label %Out
+ i32 46, label %Out
+ i32 95, label %Out
+ ]
+ContLoop:
+ %next = add nuw i64 %iv, 1
+ %c1 = call i1 @getc()
+ br i1 %c1, label %Loop, label %Out
+Out:
+ %idx = phi i32 [ %l2, %ContLoop ], [ %sinkable, %Loop ], [ %sinkable, %Loop ], [ %sinkable, %Loop ]
+ ret i32 %idx
+}
+
+; Sink a sinkable instruction through multiple non-trivially replaceable PHIs in
+; different exit blocks.
+;
+; CHECK-LABEL: @test17
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+;
+; CHECK-LABEL:OutA.split.loop.exit{{.*}}:
+; CHECK: %[[OP1:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop1 ]
+; CHECK: %[[SINKABLE:.*]] = mul i32 %N, %[[OP1]]
+; CHECK: br label %OutA
+;
+; CHECK-LABEL:OutA:
+; CHECK: phi i32{{.*}}[ %[[SINKABLE]], %OutA.split.loop.exit{{.*}} ]
+;
+; CHECK-LABEL:OutB.split.loop.exit{{.*}}:
+; CHECK: %[[OP2:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop2 ]
+; CHECK: %[[SINKABLE2:.*]] = mul i32 %N, %[[OP2]]
+; CHECK: br label %OutB
+;
+; CHECK-LABEL:OutB:
+; CHECK: phi i32 {{.*}}[ %[[SINKABLE2]], %OutB.split.loop.exit{{.*}} ]
+define i32 @test17(i32 %N, i32 %N2) {
+Entry:
+ br label %Loop
+Loop:
+ %N_addr.0.pn = phi i32 [ %dec, %ContLoop3 ], [ %N, %Entry ]
+ %sink.mul = mul i32 %N, %N_addr.0.pn
+ %c0 = call i1 @getc()
+ br i1 %c0 , label %ContLoop1, label %OutA
+ContLoop1:
+ %c1 = call i1 @getc()
+ br i1 %c1, label %ContLoop2, label %OutA
+
+ContLoop2:
+ %c2 = call i1 @getc()
+ br i1 %c2, label %ContLoop3, label %OutB
+ContLoop3:
+ %c3 = call i1 @getc()
+ %dec = add i32 %N_addr.0.pn, -1
+ br i1 %c3, label %Loop, label %OutB
+OutA:
+ %tmp1 = phi i32 [%sink.mul, %ContLoop1], [%N2, %Loop]
+ br label %Out12
+OutB:
+ %tmp2 = phi i32 [%sink.mul, %ContLoop2], [%dec, %ContLoop3]
+ br label %Out12
+Out12:
+ %tmp = phi i32 [%tmp1, %OutA], [%tmp2, %OutB]
+ ret i32 %tmp
+}
+
+
+; Sink a sinkable instruction through both trivially and non-trivially replaceable PHIs.
+;
+; CHECK-LABEL: @test18
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+; CHECK-NOT: sub
+;
+; CHECK-LABEL:Out12.split.loop.exit:
+; CHECK: %[[OP:.*]] = phi i32 [ %iv, %ContLoop ]
+; CHECK: %[[DEC:.*]] = phi i32 [ %dec, %ContLoop ]
+; CHECK: %[[SINKMUL:.*]] = mul i32 %N, %[[OP]]
+; CHECK: %[[SINKSUB:.*]] = sub i32 %[[SINKMUL]], %N2
+; CHECK: br label %Out12
+;
+; CHECK-LABEL:Out12.split.loop.exit1:
+; CHECK: %[[OP2:.*]] = phi i32 [ %iv, %Loop ]
+; CHECK: %[[SINKMUL2:.*]] = mul i32 %N, %[[OP2]]
+; CHECK: %[[SINKSUB2:.*]] = sub i32 %[[SINKMUL2]], %N2
+; CHECK: br label %Out12
+;
+; CHECK-LABEL:Out12:
+; CHECK: %tmp1 = phi i32 [ %[[SINKSUB]], %Out12.split.loop.exit ], [ %[[SINKSUB2]], %Out12.split.loop.exit1 ]
+; CHECK: %tmp2 = phi i32 [ %[[DEC]], %Out12.split.loop.exit ], [ %[[SINKSUB2]], %Out12.split.loop.exit1 ]
+; CHECK: %add = add i32 %tmp1, %tmp2
+define i32 @test18(i32 %N, i32 %N2) {
+Entry:
+ br label %Loop
+Loop:
+ %iv = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ]
+ %sink.mul = mul i32 %N, %iv
+ %sink.sub = sub i32 %sink.mul, %N2
+ %c0 = call i1 @getc()
+ br i1 %c0, label %ContLoop, label %Out12
+ContLoop:
+ %dec = add i32 %iv, -1
+ %c1 = call i1 @getc()
+ br i1 %c1, label %Loop, label %Out12
+Out12:
+ %tmp1 = phi i32 [%sink.sub, %ContLoop], [%sink.sub, %Loop]
+ %tmp2 = phi i32 [%dec, %ContLoop], [%sink.sub, %Loop]
+ %add = add i32 %tmp1, %tmp2
+ ret i32 %add
+}
+
+; Do not sink an instruction through a non-trivially replaceable PHI when the
+; predecessor's terminator is an indirectbr, to avoid an assert while splitting
+; predecessors.
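+; (Splitting here would require breaking a critical edge whose source block ends
+; in an indirectbr, which LLVM does not support, so LICM simply keeps %sinkable
+; and %sinkable2 in the loop, as the CHECK lines below verify.)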
+; CHECK-LABEL: @test19
+; CHECK-LABEL: L0:
+; CHECK: %sinkable = mul
+; CHECK: %sinkable2 = add
+
+define i32 @test19(i1 %cond, i1 %cond2, i8* %address, i32 %v1) nounwind {
+entry:
+ br label %L0
+L0:
+ %indirect.goto.dest = select i1 %cond, i8* blockaddress(@test19, %exit), i8* %address
+ %v2 = call i32 @getv()
+ %sinkable = mul i32 %v1, %v2
+ %sinkable2 = add i32 %v1, %v2
+ indirectbr i8* %indirect.goto.dest, [label %L1, label %exit]
+
+L1:
+ %indirect.goto.dest2 = select i1 %cond2, i8* blockaddress(@test19, %exit), i8* %address
+ indirectbr i8* %indirect.goto.dest2, [label %L0, label %exit]
+
+exit:
+ %r = phi i32 [%sinkable, %L0], [%sinkable2, %L1]
+ ret i32 %r
+}
+
+; Do not sink through a non-trivially replaceable PHI if splitting predecessors
+; is not allowed in SplitBlockPredecessors().
+;
+; CHECK-LABEL: @test20
+; CHECK-LABEL: while.cond
+; CHECK: %sinkable = mul
+; CHECK: %sinkable2 = add
+define void @test20(i32* %s, i1 %b, i32 %v1, i32 %v2) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ br label %while.cond
+while.cond:
+ %v = call i32 @getv()
+ %sinkable = mul i32 %v, %v2
+ %sinkable2 = add i32 %v, %v2
+ br i1 %b, label %try.cont, label %while.body
+while.body:
+ invoke void @may_throw()
+ to label %while.body2 unwind label %catch.dispatch
+while.body2:
+ invoke void @may_throw2()
+ to label %while.cond unwind label %catch.dispatch
+catch.dispatch:
+ %.lcssa1 = phi i32 [ %sinkable, %while.body ], [ %sinkable2, %while.body2 ]
+ %cp = cleanuppad within none []
+ store i32 %.lcssa1, i32* %s
+ cleanupret from %cp unwind to caller
+try.cont:
+ ret void
+}
+
+declare void @may_throw()
+declare void @may_throw2()
+declare i32 @__CxxFrameHandler3(...)
+declare i32 @getv()
+declare i1 @getc()
+declare void @f(i32*)
declare void @g()
diff --git a/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll b/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
new file mode 100644
index 00000000000..3c283dcb6e5
--- /dev/null
+++ b/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
@@ -0,0 +1,46 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S < %s | \
+; RUN: FileCheck %s
+;
+; The GPU Load & Store Vectorizer may merge differently-typed accesses into a
+; single instruction. This test checks that we merge TBAA tags for such
+; accesses correctly.
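+; (When the tags differ, the merged access is expected to carry their nearest
+; common ancestor in the TBAA type DAG, which for the float and int members
+; below is the "omnipotent char" tag; the CHECK lines verify this.)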
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; struct S {
+; float f;
+; int i;
+; };
+%struct.S = type { float, i32 }
+
+; float foo(S *p) {
+; p->f -= 1;
+; p->i -= 1;
+; return p->f;
+; }
+define float @foo(%struct.S* %p) {
+entry:
+; CHECK-LABEL: foo
+; CHECK: load <2 x i32>, {{.*}}, !tbaa [[TAG_char:!.*]]
+; CHECK: store <2 x i32> {{.*}}, !tbaa [[TAG_char]]
+ %f = getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 0
+ %0 = load float, float* %f, align 4, !tbaa !2
+ %sub = fadd float %0, -1.000000e+00
+ store float %sub, float* %f, align 4, !tbaa !2
+ %i = getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 1
+ %1 = load i32, i32* %i, align 4, !tbaa !8
+ %sub1 = add nsw i32 %1, -1
+ store i32 %sub1, i32* %i, align 4, !tbaa !8
+ ret float %sub
+}
+
+!2 = !{!3, !4, i64 0}
+!3 = !{!"_ZTS1S", !4, i64 0, !7, i64 4}
+!4 = !{!"float", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C++ TBAA"}
+!7 = !{!"int", !5, i64 0}
+!8 = !{!3, !7, i64 4}
+
+; CHECK-DAG: [[TYPE_char:!.*]] = !{!"omnipotent char", {{.*}}, i64 0}
+; CHECK-DAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0}
diff --git a/test/Transforms/LoopPredication/widened.ll b/test/Transforms/LoopPredication/widened.ll
new file mode 100644
index 00000000000..33c4e270613
--- /dev/null
+++ b/test/Transforms/LoopPredication/widened.ll
@@ -0,0 +1,138 @@
+; RUN: opt -S -loop-predication -loop-predication-enable-iv-truncation=true < %s 2>&1 | FileCheck %s
+declare void @llvm.experimental.guard(i1, ...)
+
+declare i32 @length(i8*)
+
+declare i16 @short_length(i8*)
+; Consider range checks of type i16 and i32, while the IV is of type i64.
+; We can loop-predicate this because the IV range fits within both i16 and i32.
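+; (The latch condition below, icmp ult i64 %iv.next, 16, bounds the IV to
+; [0, 15], so truncating it to i16 or i32 for the widened checks cannot wrap;
+; this is what -loop-predication-enable-iv-truncation relies on.)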
+define i64 @iv_wider_type_rc_two_narrow_types(i32 %offA, i16 %offB, i8* %arrA, i8* %arrB) {
+; CHECK-LABEL: iv_wider_type_rc_two_narrow_types
+entry:
+; CHECK-LABEL: entry:
+; CHECK: [[idxB:[^ ]+]] = sub i16 %lengthB, %offB
+; CHECK-NEXT: [[limit_checkB:[^ ]+]] = icmp ule i16 16, [[idxB]]
+; CHECK-NEXT: [[first_iteration_checkB:[^ ]+]] = icmp ult i16 %offB, %lengthB
+; CHECK-NEXT: [[WideChkB:[^ ]+]] = and i1 [[first_iteration_checkB]], [[limit_checkB]]
+; CHECK-NEXT: [[idxA:[^ ]+]] = sub i32 %lengthA, %offA
+; CHECK-NEXT: [[limit_checkA:[^ ]+]] = icmp ule i32 16, [[idxA]]
+; CHECK-NEXT: [[first_iteration_checkA:[^ ]+]] = icmp ult i32 %offA, %lengthA
+; CHECK-NEXT: [[WideChkA:[^ ]+]] = and i1 [[first_iteration_checkA]], [[limit_checkA]]
+ %lengthA = call i32 @length(i8* %arrA)
+ %lengthB = call i16 @short_length(i8* %arrB)
+ br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK: [[invariant_check:[^ ]+]] = and i1 [[WideChkB]], [[WideChkA]]
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[invariant_check]], i32 9)
+ %iv = phi i64 [0, %entry ], [ %iv.next, %loop ]
+ %iv.trunc.32 = trunc i64 %iv to i32
+ %iv.trunc.16 = trunc i64 %iv to i16
+ %indexA = add i32 %iv.trunc.32, %offA
+ %indexB = add i16 %iv.trunc.16, %offB
+ %rcA = icmp ult i32 %indexA, %lengthA
+ %rcB = icmp ult i16 %indexB, %lengthB
+ %wide.chk = and i1 %rcA, %rcB
+ call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk, i32 9) [ "deopt"() ]
+ %indexA.ext = zext i32 %indexA to i64
+ %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext
+ %eltA = load i8, i8* %addrA
+ %indexB.ext = zext i16 %indexB to i64
+ %addrB = getelementptr inbounds i8, i8* %arrB, i64 %indexB.ext
+ store i8 %eltA, i8* %addrB
+ %iv.next = add nuw nsw i64 %iv, 1
+ %latch.check = icmp ult i64 %iv.next, 16
+ br i1 %latch.check, label %loop, label %exit
+
+exit:
+ ret i64 %iv
+}
+
+
+; Consider an IV of type long and an array access into an int array.
+; The IV is of type i64 while the range check operands are of type i32 and i64.
+define i64 @iv_rc_different_types(i32 %offA, i32 %offB, i8* %arrA, i8* %arrB, i64 %max)
+{
+; CHECK-LABEL: iv_rc_different_types
+entry:
+; CHECK-LABEL: entry:
+; CHECK: [[lenB:[^ ]+]] = add i32 %lengthB, -1
+; CHECK-NEXT: [[idxB:[^ ]+]] = sub i32 [[lenB]], %offB
+; CHECK-NEXT: [[limit_checkB:[^ ]+]] = icmp ule i32 15, [[idxB]]
+; CHECK-NEXT: [[first_iteration_checkB:[^ ]+]] = icmp ult i32 %offB, %lengthB
+; CHECK-NEXT: [[WideChkB:[^ ]+]] = and i1 [[first_iteration_checkB]], [[limit_checkB]]
+; CHECK-NEXT: [[maxMinusOne:[^ ]+]] = add i64 %max, -1
+; CHECK-NEXT: [[limit_checkMax:[^ ]+]] = icmp ule i64 15, [[maxMinusOne]]
+; CHECK-NEXT: [[first_iteration_checkMax:[^ ]+]] = icmp ult i64 0, %max
+; CHECK-NEXT: [[WideChkMax:[^ ]+]] = and i1 [[first_iteration_checkMax]], [[limit_checkMax]]
+; CHECK-NEXT: [[lenA:[^ ]+]] = add i32 %lengthA, -1
+; CHECK-NEXT: [[idxA:[^ ]+]] = sub i32 [[lenA]], %offA
+; CHECK-NEXT: [[limit_checkA:[^ ]+]] = icmp ule i32 15, [[idxA]]
+; CHECK-NEXT: [[first_iteration_checkA:[^ ]+]] = icmp ult i32 %offA, %lengthA
+; CHECK-NEXT: [[WideChkA:[^ ]+]] = and i1 [[first_iteration_checkA]], [[limit_checkA]]
+ %lengthA = call i32 @length(i8* %arrA)
+ %lengthB = call i32 @length(i8* %arrB)
+ br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK: [[BandMax:[^ ]+]] = and i1 [[WideChkB]], [[WideChkMax]]
+; CHECK: [[ABandMax:[^ ]+]] = and i1 [[BandMax]], [[WideChkA]]
+; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[ABandMax]], i32 9)
+ %iv = phi i64 [0, %entry ], [ %iv.next, %loop ]
+ %iv.trunc = trunc i64 %iv to i32
+ %indexA = add i32 %iv.trunc, %offA
+ %indexB = add i32 %iv.trunc, %offB
+ %rcA = icmp ult i32 %indexA, %lengthA
+ %rcIV = icmp ult i64 %iv, %max
+ %wide.chk = and i1 %rcA, %rcIV
+ %rcB = icmp ult i32 %indexB, %lengthB
+ %wide.chk.final = and i1 %wide.chk, %rcB
+ call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk.final, i32 9) [ "deopt"() ]
+ %indexA.ext = zext i32 %indexA to i64
+ %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext
+ %eltA = load i8, i8* %addrA
+ %indexB.ext = zext i32 %indexB to i64
+ %addrB = getelementptr inbounds i8, i8* %arrB, i64 %indexB.ext
+ %eltB = load i8, i8* %addrB
+ %result = xor i8 %eltA, %eltB
+ store i8 %result, i8* %addrA
+ %iv.next = add nuw nsw i64 %iv, 1
+ %latch.check = icmp ult i64 %iv, 15
+ br i1 %latch.check, label %loop, label %exit
+
+exit:
+ ret i64 %iv
+}
+
+; We cannot narrow the IV to the range check type, because we would lose
+; information. The loop below is effectively: for (i64 i = 5; i >= 2; i++)
+; and the IV wraps around after reaching 2^64.
+define i64 @iv_rc_different_type(i32 %offA, i8* %arrA) {
+; CHECK-LABEL: iv_rc_different_type
+entry:
+ %lengthA = call i32 @length(i8* %arrA)
+ br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK: %rcA = icmp ult i32 %indexA, %lengthA
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %rcA, i32 9)
+ %iv = phi i64 [ 5, %entry ], [ %iv.next, %loop ]
+ %iv.trunc.32 = trunc i64 %iv to i32
+ %indexA = add i32 %iv.trunc.32, %offA
+ %rcA = icmp ult i32 %indexA, %lengthA
+ call void (i1, ...) @llvm.experimental.guard(i1 %rcA, i32 9) [ "deopt"() ]
+ %indexA.ext = zext i32 %indexA to i64
+ %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext
+ %eltA = load i8, i8* %addrA
+ %res = add i8 %eltA, 2
+ store i8 %eltA, i8* %addrA
+ %iv.next = add i64 %iv, 1
+ %latch.check = icmp sge i64 %iv.next, 2
+ br i1 %latch.check, label %loop, label %exit
+
+exit:
+ ret i64 %iv
+}
diff --git a/test/Transforms/LoopVectorize/pr34681.ll b/test/Transforms/LoopVectorize/pr34681.ll
new file mode 100644
index 00000000000..e93265e2ed5
--- /dev/null
+++ b/test/Transforms/LoopVectorize/pr34681.ll
@@ -0,0 +1,122 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check the scenario where we have an unknown Stride, which happens to also be
+; the loop iteration count, so if we specialize the loop for the Stride==1 case,
+; this also implies that the loop will iterate no more than a single iteration,
+; as in the following example:
+;
+; unsigned int N;
+; int tmp = 0;
+; for(unsigned int k=0;k<N;k++) {
+; tmp+=(int)B[k*N+j];
+; }
+;
+; We check here that the following runtime scev guard for Stride==1 is NOT generated:
+; vector.scevcheck:
+; %ident.check = icmp ne i32 %N, 1
+; %0 = or i1 false, %ident.check
+; br i1 %0, label %scalar.ph, label %vector.ph
+; Instead the loop is vectorized with an unknown stride.
+
+; CHECK-LABEL: @foo1
+; CHECK: for.body.lr.ph
+; CHECK-NOT: %ident.check = icmp ne i32 %N, 1
+; CHECK-NOT: %[[TEST:[0-9]+]] = or i1 false, %ident.check
+; CHECK-NOT: br i1 %[[TEST]], label %scalar.ph, label %vector.ph
+; CHECK: vector.ph
+; CHECK: vector.body
+; CHECK: <4 x i32>
+; CHECK: middle.block
+; CHECK: scalar.ph
+
+
+define i32 @foo1(i32 %N, i16* nocapture readnone %A, i16* nocapture readonly %B, i32 %i, i32 %j) {
+entry:
+ %cmp8 = icmp eq i32 %N, 0
+ br i1 %cmp8, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+ br label %for.body
+
+for.body:
+ %tmp.010 = phi i32 [ 0, %for.body.lr.ph ], [ %add1, %for.body ]
+ %k.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %mul = mul i32 %k.09, %N
+ %add = add i32 %mul, %j
+ %arrayidx = getelementptr inbounds i16, i16* %B, i32 %add
+ %0 = load i16, i16* %arrayidx, align 2
+ %conv = sext i16 %0 to i32
+ %add1 = add nsw i32 %tmp.010, %conv
+ %inc = add nuw i32 %k.09, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ %add1.lcssa = phi i32 [ %add1, %for.body ]
+ br label %for.end
+
+for.end:
+ %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add1.lcssa, %for.end.loopexit ]
+ ret i32 %tmp.0.lcssa
+}
+
+
+; Check the same, but also where the Stride and the loop iteration count
+; are not of the same data type.
+;
+; unsigned short N;
+; int tmp = 0;
+; for(unsigned int k=0;k<N;k++) {
+; tmp+=(int)B[k*N+j];
+; }
+;
+; We check here that the following runtime scev guard for Stride==1 is NOT generated:
+; vector.scevcheck:
+; %ident.check = icmp ne i16 %N, 1
+; %0 = or i1 false, %ident.check
+; br i1 %0, label %scalar.ph, label %vector.ph
+
+
+; CHECK-LABEL: @foo2
+; CHECK: for.body.lr.ph
+; CHECK-NOT: %ident.check = icmp ne i16 %N, 1
+; CHECK-NOT: %[[TEST:[0-9]+]] = or i1 false, %ident.check
+; CHECK-NOT: br i1 %[[TEST]], label %scalar.ph, label %vector.ph
+; CHECK: vector.ph
+; CHECK: vector.body
+; CHECK: <4 x i32>
+; CHECK: middle.block
+; CHECK: scalar.ph
+
+define i32 @foo2(i16 zeroext %N, i16* nocapture readnone %A, i16* nocapture readonly %B, i32 %i, i32 %j) {
+entry:
+ %conv = zext i16 %N to i32
+ %cmp11 = icmp eq i16 %N, 0
+ br i1 %cmp11, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+ br label %for.body
+
+for.body:
+ %tmp.013 = phi i32 [ 0, %for.body.lr.ph ], [ %add4, %for.body ]
+ %k.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %mul = mul nuw i32 %k.012, %conv
+ %add = add i32 %mul, %j
+ %arrayidx = getelementptr inbounds i16, i16* %B, i32 %add
+ %0 = load i16, i16* %arrayidx, align 2
+ %conv3 = sext i16 %0 to i32
+ %add4 = add nsw i32 %tmp.013, %conv3
+ %inc = add nuw nsw i32 %k.012, 1
+ %exitcond = icmp eq i32 %inc, %conv
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ %add4.lcssa = phi i32 [ %add4, %for.body ]
+ br label %for.end
+
+for.end:
+ %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add4.lcssa, %for.end.loopexit ]
+ ret i32 %tmp.0.lcssa
+}
diff --git a/test/Transforms/LoopVectorize/version-mem-access.ll b/test/Transforms/LoopVectorize/version-mem-access.ll
index a9d319e5a2d..774b6f26859 100644
--- a/test/Transforms/LoopVectorize/version-mem-access.ll
+++ b/test/Transforms/LoopVectorize/version-mem-access.ll
@@ -65,7 +65,8 @@ for.end:
define void @fn1(double* noalias %x, double* noalias %c, double %a) {
entry:
%conv = fptosi double %a to i32
- %cmp8 = icmp sgt i32 %conv, 0
+ %conv2 = add i32 %conv, 4
+ %cmp8 = icmp sgt i32 %conv2, 0
br i1 %cmp8, label %for.body.preheader, label %for.end
for.body.preheader:
@@ -82,7 +83,7 @@ for.body:
store double %1, double* %arrayidx3, align 8
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
- %exitcond = icmp eq i32 %lftr.wideiv, %conv
+ %exitcond = icmp eq i32 %lftr.wideiv, %conv2
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit:
diff --git a/test/Transforms/LowerTypeTests/blockaddress.ll b/test/Transforms/LowerTypeTests/blockaddress.ll
new file mode 100644
index 00000000000..ecc4814cfd5
--- /dev/null
+++ b/test/Transforms/LowerTypeTests/blockaddress.ll
@@ -0,0 +1,27 @@
+; RUN: opt -S %s -lowertypetests | FileCheck %s
+
+
+; CHECK: define internal i8* @f2.cfi() !type !0 {
+; CHECK-NEXT: br label %b
+; CHECK: b:
+; CHECK-NEXT: ret i8* blockaddress(@f2.cfi, %b)
+; CHECK-NEXT: }
+
+target triple = "x86_64-unknown-linux"
+
+define void @f1() {
+entry:
+ %0 = call i1 @llvm.type.test(i8* bitcast (i8* ()* @f2 to i8*), metadata !"_ZTSFvP3bioE")
+ ret void
+}
+
+declare i1 @llvm.type.test(i8*, metadata)
+
+define i8* @f2() !type !5 {
+ br label %b
+
+b:
+ ret i8* blockaddress(@f2, %b)
+}
+
+!5 = !{i64 0, !"_ZTSFvP3bioE"}
diff --git a/test/Transforms/LowerTypeTests/import-unsat.ll b/test/Transforms/LowerTypeTests/import-unsat.ll
index 6cb9b26fb57..b9eb552dd66 100644
--- a/test/Transforms/LowerTypeTests/import-unsat.ll
+++ b/test/Transforms/LowerTypeTests/import-unsat.ll
@@ -7,6 +7,7 @@
; SUMMARY-NEXT: - Linkage: 0
; SUMMARY-NEXT: NotEligibleToImport: false
; SUMMARY-NEXT: Live: true
+; SUMMARY-NEXT: Local: false
; SUMMARY-NEXT: TypeTests: [ 123 ]
; SUMMARY-NEXT: TypeIdMap:
; SUMMARY-NEXT: typeid1:
diff --git a/test/Transforms/PGOProfile/Inputs/irreducible.proftext b/test/Transforms/PGOProfile/Inputs/irreducible.proftext
new file mode 100644
index 00000000000..9b0210d9a30
--- /dev/null
+++ b/test/Transforms/PGOProfile/Inputs/irreducible.proftext
@@ -0,0 +1,29 @@
+:ir
+_Z11irreducibleii
+# Func Hash:
+64451410787
+# Num Counters:
+6
+# Counter Values:
+1000
+950
+100
+373
+1
+0
+
+_Z11irreduciblePh
+# Func Hash:
+104649601521
+# Num Counters:
+9
+# Counter Values:
+100
+300
+99
+300
+201
+1
+1
+0
+0
diff --git a/test/Transforms/PGOProfile/irreducible.ll b/test/Transforms/PGOProfile/irreducible.ll
new file mode 100644
index 00000000000..37f6e206ee9
--- /dev/null
+++ b/test/Transforms/PGOProfile/irreducible.ll
@@ -0,0 +1,184 @@
+; RUN: llvm-profdata merge %S/Inputs/irreducible.proftext -o %t.profdata
+; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE
+
+; GEN: $__llvm_profile_raw_version = comdat any
+
+; Function Attrs: noinline norecurse nounwind readnone uwtable
+define i32 @_Z11irreducibleii(i32 %iter_outer, i32 %iter_inner) local_unnamed_addr #0 {
+entry:
+ %cmp24 = icmp sgt i32 %iter_outer, 0
+ br i1 %cmp24, label %for.body, label %entry.for.cond.cleanup_crit_edge
+
+entry.for.cond.cleanup_crit_edge: ; preds = %entry
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %entry.for.cond.cleanup_crit_edge, %for.end
+ %sum.0.lcssa = phi i32 [ 0, %entry.for.cond.cleanup_crit_edge ], [ %sum.1, %for.end ]
+ ret i32 %sum.0.lcssa
+
+for.body: ; preds = %entry, %for.end
+ %k.026 = phi i32 [ %inc12, %for.end ], [ 0, %entry ]
+ %sum.025 = phi i32 [ %sum.1, %for.end ], [ 0, %entry ]
+ %rem23 = and i32 %k.026, 1
+ %cmp1 = icmp eq i32 %rem23, 0
+ br i1 %cmp1, label %entry8, label %for.cond2
+
+for.cond2: ; preds = %for.body, %if.end9
+ %sum.1 = phi i32 [ %add10, %if.end9 ], [ %sum.025, %for.body ]
+ %i.0 = phi i32 [ %inc, %if.end9 ], [ 0, %for.body ]
+ %cmp3 = icmp slt i32 %i.0, %iter_inner
+ br i1 %cmp3, label %for.body4, label %for.end
+; USE: br i1 %cmp3, label %for.body4, label %for.end, !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[FOR_COND2_IRR_LOOP:[0-9]+]]
+
+for.body4: ; preds = %for.cond2
+ %rem5 = srem i32 %k.026, 3
+ %cmp6 = icmp eq i32 %rem5, 0
+ br i1 %cmp6, label %entry8, label %if.end9
+
+entry8: ; preds = %for.body4, %for.body
+ %sum.2 = phi i32 [ %sum.025, %for.body ], [ %sum.1, %for.body4 ]
+ %i.1 = phi i32 [ 0, %for.body ], [ %i.0, %for.body4 ]
+ %add = add nsw i32 %sum.2, 4
+ br label %if.end9
+; USE: br label %if.end9,
+; USE-SAME: !irr_loop ![[ENTRY8_IRR_LOOP:[0-9]+]]
+
+if.end9: ; preds = %entry8, %for.body4
+ %sum.3 = phi i32 [ %add, %entry8 ], [ %sum.1, %for.body4 ]
+ %i.2 = phi i32 [ %i.1, %entry8 ], [ %i.0, %for.body4 ]
+ %add10 = add nsw i32 %sum.3, 1
+ %inc = add nsw i32 %i.2, 1
+ br label %for.cond2
+; USE: br label %for.cond2,
+; USE-SAME: !irr_loop ![[IF_END9_IRR_LOOP:[0-9]+]]
+
+for.end: ; preds = %for.cond2
+ %inc12 = add nuw nsw i32 %k.026, 1
+ %exitcond = icmp eq i32 %inc12, %iter_outer
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+
+
+@targets = local_unnamed_addr global [256 x i8*] zeroinitializer, align 16
+@tracing = local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: noinline norecurse nounwind uwtable
+define i32 @_Z11irreduciblePh(i8* nocapture readonly %p) {
+entry:
+ store <2 x i8*> <i8* blockaddress(@_Z11irreduciblePh, %sw.bb), i8* blockaddress(@_Z11irreduciblePh, %TARGET_1)>, <2 x i8*>* bitcast ([256 x i8*]* @targets to <2 x i8*>*), align 16
+ store i8* blockaddress(@_Z11irreduciblePh, %TARGET_2), i8** getelementptr inbounds ([256 x i8*], [256 x i8*]* @targets, i64 0, i64 2), align 16
+ %0 = load i32, i32* @tracing, align 4
+ %tobool = icmp eq i32 %0, 0
+ br label %for.cond1
+
+for.cond1: ; preds = %sw.default, %entry
+ %p.addr.0 = phi i8* [ %p, %entry ], [ %p.addr.4, %sw.default ]
+ %sum.0 = phi i32 [ 0, %entry ], [ %add25, %sw.default ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i64 1
+ %1 = load i8, i8* %p.addr.0, align 1
+ %incdec.ptr2 = getelementptr inbounds i8, i8* %p.addr.0, i64 2
+ %2 = load i8, i8* %incdec.ptr, align 1
+ %conv3 = zext i8 %2 to i32
+ br label %dispatch_op
+
+dispatch_op: ; preds = %sw.bb6, %for.cond1
+ %p.addr.1 = phi i8* [ %incdec.ptr2, %for.cond1 ], [ %p.addr.2, %sw.bb6 ]
+ %op.0 = phi i8 [ %1, %for.cond1 ], [ 1, %sw.bb6 ]
+ %oparg.0 = phi i32 [ %conv3, %for.cond1 ], [ %oparg.2, %sw.bb6 ]
+ %sum.1 = phi i32 [ %sum.0, %for.cond1 ], [ %add7, %sw.bb6 ]
+ switch i8 %op.0, label %sw.default [
+ i8 0, label %sw.bb
+ i8 1, label %dispatch_op.sw.bb6_crit_edge
+ i8 2, label %sw.bb15
+ ]
+
+dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op
+ br label %sw.bb6
+
+sw.bb: ; preds = %indirectgoto, %dispatch_op
+ %oparg.1 = phi i32 [ %oparg.0, %dispatch_op ], [ 0, %indirectgoto ]
+ %sum.2 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %indirectgoto ]
+ %add.neg = sub i32 -5, %oparg.1
+ %sub = add i32 %add.neg, %sum.2
+ br label %exit
+
+TARGET_1: ; preds = %indirectgoto
+ %incdec.ptr4 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
+ %3 = load i8, i8* %p.addr.5, align 1
+ %conv5 = zext i8 %3 to i32
+ br label %sw.bb6
+
+sw.bb6: ; preds = %dispatch_op.sw.bb6_crit_edge, %TARGET_1
+ %p.addr.2 = phi i8* [ %incdec.ptr4, %TARGET_1 ], [ %p.addr.1, %dispatch_op.sw.bb6_crit_edge ]
+ %oparg.2 = phi i32 [ %conv5, %TARGET_1 ], [ %oparg.0, %dispatch_op.sw.bb6_crit_edge ]
+ %sum.3 = phi i32 [ %sum.7, %TARGET_1 ], [ %sum.1, %dispatch_op.sw.bb6_crit_edge ]
+ %mul = mul nsw i32 %oparg.2, 7
+ %add7 = add nsw i32 %sum.3, %mul
+ %rem46 = and i32 %add7, 1
+ %cmp8 = icmp eq i32 %rem46, 0
+ br i1 %cmp8, label %dispatch_op, label %if.then
+; USE: br i1 %cmp8, label %dispatch_op, label %if.then, !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[SW_BB6_IRR_LOOP:[0-9]+]]
+
+if.then: ; preds = %sw.bb6
+ %mul9 = mul nsw i32 %add7, 9
+ br label %indirectgoto
+
+TARGET_2: ; preds = %indirectgoto
+ %incdec.ptr13 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
+ %4 = load i8, i8* %p.addr.5, align 1
+ %conv14 = zext i8 %4 to i32
+ br label %sw.bb15
+
+sw.bb15: ; preds = %TARGET_2, %dispatch_op
+ %p.addr.3 = phi i8* [ %p.addr.1, %dispatch_op ], [ %incdec.ptr13, %TARGET_2 ]
+ %oparg.3 = phi i32 [ %oparg.0, %dispatch_op ], [ %conv14, %TARGET_2 ]
+ %sum.4 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %TARGET_2 ]
+ %add16 = add nsw i32 %oparg.3, 3
+ %add17 = add nsw i32 %add16, %sum.4
+ br i1 %tobool, label %if.then18, label %exit
+; USE: br i1 %tobool, label %if.then18, label %exit, !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[SW_BB15_IRR_LOOP:[0-9]+]]
+
+if.then18: ; preds = %sw.bb15
+ %idx.ext = sext i32 %oparg.3 to i64
+ %add.ptr = getelementptr inbounds i8, i8* %p.addr.3, i64 %idx.ext
+ %mul19 = mul nsw i32 %add17, 17
+ br label %indirectgoto
+
+unknown_op: ; preds = %indirectgoto
+ %sub24 = add nsw i32 %sum.7, -4
+ br label %sw.default
+
+sw.default: ; preds = %unknown_op, %dispatch_op
+ %p.addr.4 = phi i8* [ %p.addr.5, %unknown_op ], [ %p.addr.1, %dispatch_op ]
+ %sum.5 = phi i32 [ %sub24, %unknown_op ], [ %sum.1, %dispatch_op ]
+ %add25 = add nsw i32 %sum.5, 11
+ br label %for.cond1
+
+exit: ; preds = %sw.bb15, %sw.bb
+ %sum.6 = phi i32 [ %sub, %sw.bb ], [ %add17, %sw.bb15 ]
+ ret i32 %sum.6
+
+indirectgoto: ; preds = %if.then18, %if.then
+ %add.ptr.pn = phi i8* [ %add.ptr, %if.then18 ], [ %p.addr.2, %if.then ]
+ %sum.7 = phi i32 [ %mul19, %if.then18 ], [ %mul9, %if.then ]
+ %p.addr.5 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 1
+ %5 = load i8, i8* %add.ptr.pn, align 1
+ %idxprom21 = zext i8 %5 to i64
+ %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21
+ %6 = load i8*, i8** %arrayidx22, align 8
+ indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2]
+; USE: indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[INDIRECTGOTO_IRR_LOOP:[0-9]+]]
+}
+
+; USE: ![[FOR_COND2_IRR_LOOP]] = !{!"loop_header_weight", i64 1050}
+; USE: ![[ENTRY8_IRR_LOOP]] = !{!"loop_header_weight", i64 373}
+; USE: ![[IF_END9_IRR_LOOP]] = !{!"loop_header_weight", i64 1000}
+; USE: ![[SW_BB6_IRR_LOOP]] = !{!"loop_header_weight", i64 501}
+; USE: ![[SW_BB15_IRR_LOOP]] = !{!"loop_header_weight", i64 100}
+; USE: ![[INDIRECTGOTO_IRR_LOOP]] = !{!"loop_header_weight", i64 400}
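+;
+; The !irr_loop "loop_header_weight" annotations mark blocks that act as headers
+; of irreducible cycles and record their profile weight; the values above are
+; derived from the counter values in Inputs/irreducible.proftext.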
diff --git a/test/Transforms/PGOProfile/thinlto_samplepgo_icp2.ll b/test/Transforms/PGOProfile/thinlto_samplepgo_icp2.ll
index c1c074e75a7..1751854d448 100644
--- a/test/Transforms/PGOProfile/thinlto_samplepgo_icp2.ll
+++ b/test/Transforms/PGOProfile/thinlto_samplepgo_icp2.ll
@@ -22,7 +22,7 @@
; RUN: llvm-nm %t3.2 | FileCheck %s --check-prefix=NM
; NM: _ZL3barv
; RUN: llvm-dis < %t3.2.2.internalize.bc | FileCheck %s --check-prefix=INTERNALIZE
-; INTERNALIZE: define void @_ZL3barv
+; INTERNALIZE: define dso_local void @_ZL3barv
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll
index 105afa9def5..ebc15865a67 100644
--- a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll
+++ b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll
@@ -75,6 +75,54 @@ define void @test_dereferenceable(i32 addrspace(1)* addrspace(1)* %p, i32 %x, i3
ret void
}
+; invariant.start would allow us to sink the load past the baz statepoint call
+; into the taken block, which is incorrect. Remove the invariant.start and
+; replace its uses (RAUW) with undef.
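+; (A gc.statepoint may relocate heap objects, so memory reachable through
+; addrspace(1) pointers cannot be assumed invariant across the call; keeping the
+; invariant.start would let later passes move the load across the safepoint.)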
+define void @test_inv_start(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" {
+; CHECK-LABEL: test_inv_start
+; CHECK-NOT: invariant.start
+; CHECK: gc.statepoint
+ %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p
+ %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1)
+ %v2 = load i32, i32 addrspace(1)* %v1
+ call void @baz(i32 %x)
+ br i1 %cond, label %taken, label %untaken
+
+taken:
+ store i32 %v2, i32 addrspace(1)* %q, align 16
+ call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1)
+ ret void
+
+; CHECK-LABEL: untaken:
+; CHECK: gc.statepoint
+untaken:
+ %foo = call i32 @escaping.invariant.start({}* %invst)
+ call void @dummy(i32 %foo)
+ ret void
+}
+
+; invariant.start is removed and the uses are undef'ed.
+define void @test_inv_start2(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" {
+; CHECK-LABEL: test_inv_start2
+; CHECK-NOT: invariant.start
+; CHECK: gc.statepoint
+ %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p
+ %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1)
+ %v2 = load i32, i32 addrspace(1)* %v1
+ call void @baz(i32 %x)
+ br i1 %cond, label %taken, label %untaken
+
+taken:
+ store i32 %v2, i32 addrspace(1)* %q, align 16
+ call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1)
+ ret void
+
+untaken:
+ ret void
+}
+declare {}* @llvm.invariant.start.p1i32(i64, i32 addrspace(1)* nocapture) nounwind readonly
+declare void @llvm.invariant.end.p1i32({}*, i64, i32 addrspace(1)* nocapture) nounwind
+declare i32 @escaping.invariant.start({}*) nounwind
+declare void @dummy(i32)
declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64, i32, void (i32)*, i32, i32, ...)
; Function Attrs: nounwind readonly
diff --git a/test/Transforms/SLPVectorizer/X86/call.ll b/test/Transforms/SLPVectorizer/X86/call.ll
index 03b1e837a0c..8397d348483 100644
--- a/test/Transforms/SLPVectorizer/X86/call.ll
+++ b/test/Transforms/SLPVectorizer/X86/call.ll
@@ -11,133 +11,158 @@ declare double @sqrt(double)
declare i64 @round(i64)
-; CHECK: sin_libm
-; CHECK: call <2 x double> @llvm.sin.v2f64
-; CHECK: ret void
-define void @sin_libm(double* %a, double* %b, double* %c) {
-entry:
- %i0 = load double, double* %a, align 8
- %i1 = load double, double* %b, align 8
- %mul = fmul double %i0, %i1
- %call = tail call double @sin(double %mul) nounwind readnone
- %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
- %i3 = load double, double* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
- %i4 = load double, double* %arrayidx4, align 8
- %mul5 = fmul double %i3, %i4
- %call5 = tail call double @sin(double %mul5) nounwind readnone
- store double %call, double* %c, align 8
- %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
- store double %call5, double* %arrayidx5, align 8
+define void @sin_libm(double* %a, double* %b) {
+; CHECK-LABEL: @sin_libm(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* %a to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* %b to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %sin1 = tail call double @sin(double %a0) nounwind readnone
+ %sin2 = tail call double @sin(double %a1) nounwind readnone
+ store double %sin1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %sin2, double* %idx2, align 8
ret void
}
-; CHECK: cos_libm
-; CHECK: call <2 x double> @llvm.cos.v2f64
-; CHECK: ret void
-define void @cos_libm(double* %a, double* %b, double* %c) {
-entry:
- %i0 = load double, double* %a, align 8
- %i1 = load double, double* %b, align 8
- %mul = fmul double %i0, %i1
- %call = tail call double @cos(double %mul) nounwind readnone
- %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
- %i3 = load double, double* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
- %i4 = load double, double* %arrayidx4, align 8
- %mul5 = fmul double %i3, %i4
- %call5 = tail call double @cos(double %mul5) nounwind readnone
- store double %call, double* %c, align 8
- %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
- store double %call5, double* %arrayidx5, align 8
+define void @cos_libm(double* %a, double* %b) {
+; CHECK-LABEL: @cos_libm(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* %a to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.cos.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* %b to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %cos1 = tail call double @cos(double %a0) nounwind readnone
+ %cos2 = tail call double @cos(double %a1) nounwind readnone
+ store double %cos1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %cos2, double* %idx2, align 8
ret void
}
-; CHECK: pow_libm
-; CHECK: call <2 x double> @llvm.pow.v2f64
-; CHECK: ret void
-define void @pow_libm(double* %a, double* %b, double* %c) {
-entry:
- %i0 = load double, double* %a, align 8
- %i1 = load double, double* %b, align 8
- %mul = fmul double %i0, %i1
- %call = tail call double @pow(double %mul,double %mul) nounwind readnone
- %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
- %i3 = load double, double* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
- %i4 = load double, double* %arrayidx4, align 8
- %mul5 = fmul double %i3, %i4
- %call5 = tail call double @pow(double %mul5,double %mul5) nounwind readnone
- store double %call, double* %c, align 8
- %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
- store double %call5, double* %arrayidx5, align 8
+define void @pow_libm(double* %a, double* %b) {
+; CHECK-LABEL: @pow_libm(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* %a to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.pow.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* %b to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %pow1 = tail call double @pow(double %a0, double %a0) nounwind readnone
+ %pow2 = tail call double @pow(double %a1, double %a1) nounwind readnone
+ store double %pow1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %pow2, double* %idx2, align 8
ret void
}
-
-; CHECK: exp2_libm
-; CHECK: call <2 x double> @llvm.exp2.v2f64
-; CHECK: ret void
-define void @exp2_libm(double* %a, double* %b, double* %c) {
-entry:
- %i0 = load double, double* %a, align 8
- %i1 = load double, double* %b, align 8
- %mul = fmul double %i0, %i1
- %call = tail call double @exp2(double %mul) nounwind readnone
- %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
- %i3 = load double, double* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
- %i4 = load double, double* %arrayidx4, align 8
- %mul5 = fmul double %i3, %i4
- %call5 = tail call double @exp2(double %mul5) nounwind readnone
- store double %call, double* %c, align 8
- %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
- store double %call5, double* %arrayidx5, align 8
+define void @exp_libm(double* %a, double* %b) {
+; CHECK-LABEL: @exp_libm(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* %a to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.exp2.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* %b to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %exp1 = tail call double @exp2(double %a0) nounwind readnone
+ %exp2 = tail call double @exp2(double %a1) nounwind readnone
+ store double %exp1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %exp2, double* %idx2, align 8
ret void
}
-
-; CHECK: sqrt_libm
-; CHECK: call nnan <2 x double> @llvm.sqrt.v2f64
-; CHECK: ret void
-define void @sqrt_libm(double* %a, double* %b, double* %c) {
-entry:
- %i0 = load double, double* %a, align 8
- %i1 = load double, double* %b, align 8
- %mul = fmul double %i0, %i1
- %call = tail call nnan double @sqrt(double %mul) nounwind readnone
- %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
- %i3 = load double, double* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
- %i4 = load double, double* %arrayidx4, align 8
- %mul5 = fmul double %i3, %i4
- %call5 = tail call nnan double @sqrt(double %mul5) nounwind readnone
- store double %call, double* %c, align 8
- %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
- store double %call5, double* %arrayidx5, align 8
+; No fast-math-flags are required to convert sqrt library calls to an intrinsic.
+; We just need to know that errno is not set (readnone).
+
+define void @sqrt_libm_no_errno(double* %a, double* %b) {
+; CHECK-LABEL: @sqrt_libm_no_errno(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* %a to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* %b to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %sqrt1 = tail call double @sqrt(double %a0) nounwind readnone
+ %sqrt2 = tail call double @sqrt(double %a1) nounwind readnone
+ store double %sqrt1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %sqrt2, double* %idx2, align 8
ret void
}
+; The sqrt intrinsic does not set errno, but a non-constant sqrt call might, so these calls can't be vectorized.
+; The nnan on the call does not matter because there's no guarantee in the C standard that a negative
+; input would result in a nan output ("On a domain error, the function returns an
+; implementation-defined value.")
+
+define void @sqrt_libm_errno(double* %a, double* %b) {
+; CHECK-LABEL: @sqrt_libm_errno(
+; CHECK-NEXT: [[A0:%.*]] = load double, double* %a, align 8
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* %a, i64 1
+; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDX1]], align 8
+; CHECK-NEXT: [[SQRT1:%.*]] = tail call nnan double @sqrt(double [[A0]]) #2
+; CHECK-NEXT: [[SQRT2:%.*]] = tail call nnan double @sqrt(double [[A1]]) #2
+; CHECK-NEXT: store double [[SQRT1]], double* %b, align 8
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, double* %b, i64 1
+; CHECK-NEXT: store double [[SQRT2]], double* [[IDX2]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %sqrt1 = tail call nnan double @sqrt(double %a0) nounwind
+ %sqrt2 = tail call nnan double @sqrt(double %a1) nounwind
+ store double %sqrt1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %sqrt2, double* %idx2, align 8
+ ret void
+}
; Negative test case
-; CHECK: round_custom
-; CHECK-NOT: load <4 x i64>
-; CHECK: ret void
-define void @round_custom(i64* %a, i64* %b, i64* %c) {
-entry:
- %i0 = load i64, i64* %a, align 8
- %i1 = load i64, i64* %b, align 8
- %mul = mul i64 %i0, %i1
- %call = tail call i64 @round(i64 %mul) nounwind readnone
- %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 1
- %i3 = load i64, i64* %arrayidx3, align 8
- %arrayidx4 = getelementptr inbounds i64, i64* %b, i64 1
- %i4 = load i64, i64* %arrayidx4, align 8
- %mul5 = mul i64 %i3, %i4
- %call5 = tail call i64 @round(i64 %mul5) nounwind readnone
- store i64 %call, i64* %c, align 8
- %arrayidx5 = getelementptr inbounds i64, i64* %c, i64 1
- store i64 %call5, i64* %arrayidx5, align 8
+define void @round_custom(i64* %a, i64* %b) {
+; CHECK-LABEL: @round_custom(
+; CHECK-NEXT: [[A0:%.*]] = load i64, i64* %a, align 8
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds i64, i64* %a, i64 1
+; CHECK-NEXT: [[A1:%.*]] = load i64, i64* [[IDX1]], align 8
+; CHECK-NEXT: [[ROUND1:%.*]] = tail call i64 @round(i64 [[A0]]) #3
+; CHECK-NEXT: [[ROUND2:%.*]] = tail call i64 @round(i64 [[A1]]) #3
+; CHECK-NEXT: store i64 [[ROUND1]], i64* %b, align 8
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds i64, i64* %b, i64 1
+; CHECK-NEXT: store i64 [[ROUND2]], i64* [[IDX2]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load i64, i64* %a, align 8
+ %idx1 = getelementptr inbounds i64, i64* %a, i64 1
+ %a1 = load i64, i64* %idx1, align 8
+ %round1 = tail call i64 @round(i64 %a0) nounwind readnone
+ %round2 = tail call i64 @round(i64 %a1) nounwind readnone
+ store i64 %round1, i64* %b, align 8
+ %idx2 = getelementptr inbounds i64, i64* %b, i64 1
+ store i64 %round2, i64* %idx2, align 8
ret void
}
diff --git a/test/Transforms/SLPVectorizer/X86/cast.ll b/test/Transforms/SLPVectorizer/X86/cast.ll
index 5d7118753e9..2f9f84948ea 100644
--- a/test/Transforms/SLPVectorizer/X86/cast.ll
+++ b/test/Transforms/SLPVectorizer/X86/cast.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -basicaa -slp-vectorizer -dce -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -basicaa -slp-vectorizer -dce -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -basicaa -slp-vectorizer -dce -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -basicaa -slp-vectorizer -dce -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -14,10 +14,10 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
define i32 @test_sext_4i8_to_4i32(i32* noalias nocapture %A, i8* noalias nocapture %B) {
; CHECK-LABEL: @test_sext_4i8_to_4i32(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* %B to <4 x i8>*
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[B:%.*]] to <4 x i8>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i8> [[TMP1]] to <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* %A to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: ret i32 undef
;
@@ -46,10 +46,10 @@ entry:
define i32 @test_zext_4i16_to_4i32(i32* noalias nocapture %A, i16* noalias nocapture %B) {
; CHECK-LABEL: @test_zext_4i16_to_4i32(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* %B to <4 x i16>*
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <4 x i16>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* %A to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: ret i32 undef
;
@@ -76,30 +76,21 @@ entry:
}
define i64 @test_sext_4i16_to_4i64(i64* noalias nocapture %A, i16* noalias nocapture %B) {
-; SSE-LABEL: @test_sext_4i16_to_4i64(
-; SSE-NEXT: entry:
-; SSE-NEXT: [[TMP0:%.*]] = bitcast i16* %B to <2 x i16>*
-; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* [[TMP0]], align 1
-; SSE-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i64>
-; SSE-NEXT: [[TMP3:%.*]] = bitcast i64* %A to <2 x i64>*
-; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 4
-; SSE-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* %B, i64 2
-; SSE-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* %A, i64 2
-; SSE-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX5]] to <2 x i16>*
-; SSE-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* [[TMP4]], align 1
-; SSE-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP5]] to <2 x i64>
-; SSE-NEXT: [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX7]] to <2 x i64>*
-; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4
-; SSE-NEXT: ret i64 undef
-;
-; AVX-LABEL: @test_sext_4i16_to_4i64(
-; AVX-NEXT: entry:
-; AVX-NEXT: [[TMP0:%.*]] = bitcast i16* %B to <4 x i16>*
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 1
-; AVX-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i64>
-; AVX-NEXT: [[TMP3:%.*]] = bitcast i64* %A to <4 x i64>*
-; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* [[TMP3]], align 4
-; AVX-NEXT: ret i64 undef
+; CHECK-LABEL: @test_sext_4i16_to_4i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <2 x i16>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* [[TMP0]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX5]] to <2 x i16>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* [[TMP4]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP5]] to <2 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX7]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4
+; CHECK-NEXT: ret i64 undef
;
entry:
%0 = load i16, i16* %B, align 1
diff --git a/test/Transforms/SLPVectorizer/X86/load-merge.ll b/test/Transforms/SLPVectorizer/X86/load-merge.ll
new file mode 100644
index 00000000000..df990be073b
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s
+
+;unsigned load_le32(unsigned char *data) {
+; unsigned le32 = (data[0]<<0) | (data[1]<<8) | (data[2]<<16) | (data[3]<<24);
+; return le32;
+;}
+
+define i32 @_Z9load_le32Ph(i8* nocapture readonly %data) {
+; CHECK-LABEL: @_Z9load_le32Ph(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[DATA:%.*]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT: [[SHL3:%.*]] = shl nuw nsw i32 [[CONV2]], 8
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHL3]], [[CONV]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX4]], align 1
+; CHECK-NEXT: [[CONV5:%.*]] = zext i8 [[TMP2]] to i32
+; CHECK-NEXT: [[SHL6:%.*]] = shl nuw nsw i32 [[CONV5]], 16
+; CHECK-NEXT: [[OR7:%.*]] = or i32 [[OR]], [[SHL6]]
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX8]], align 1
+; CHECK-NEXT: [[CONV9:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT: [[SHL10:%.*]] = shl nuw i32 [[CONV9]], 24
+; CHECK-NEXT: [[OR11:%.*]] = or i32 [[OR7]], [[SHL10]]
+; CHECK-NEXT: ret i32 [[OR11]]
+;
+entry:
+ %0 = load i8, i8* %data, align 1
+ %conv = zext i8 %0 to i32
+ %arrayidx1 = getelementptr inbounds i8, i8* %data, i64 1
+ %1 = load i8, i8* %arrayidx1, align 1
+ %conv2 = zext i8 %1 to i32
+ %shl3 = shl nuw nsw i32 %conv2, 8
+ %or = or i32 %shl3, %conv
+ %arrayidx4 = getelementptr inbounds i8, i8* %data, i64 2
+ %2 = load i8, i8* %arrayidx4, align 1
+ %conv5 = zext i8 %2 to i32
+ %shl6 = shl nuw nsw i32 %conv5, 16
+ %or7 = or i32 %or, %shl6
+ %arrayidx8 = getelementptr inbounds i8, i8* %data, i64 3
+ %3 = load i8, i8* %arrayidx8, align 1
+ %conv9 = zext i8 %3 to i32
+ %shl10 = shl nuw i32 %conv9, 24
+ %or11 = or i32 %or7, %shl10
+ ret i32 %or11
+}
diff --git a/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll b/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
new file mode 100644
index 00000000000..79fb782db8f
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
+
+;void Distance(float *p1, int p2, unsigned long p3[], float p4[]) {
+; long a = p3[0] = 5;
+; p1 += p2;
+; p4[3] += p1[a];
+; p3[0] >>= 5;
+; p3[1] >>= 5;
+; p3[2] >>= 5;
+; p3[3] >>= 5;
+; p1 += p2;
+; p4[0] += p1[p3[0] & a];
+;}
+
+define void @_Z8DistanceIlLi5EEvPfiPmS0_(float* %p1, i32 %p2, i64* %p3, float* %p4) {
+; CHECK-LABEL: @_Z8DistanceIlLi5EEvPfiPmS0_(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: store i64 5, i64* [[P3:%.*]], align 8
+; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[P2:%.*]] to i64
+; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[P1:%.*]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 5
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[P4:%.*]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
+; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[P3]], align 8
+; CHECK-NEXT: [[SHR:%.*]] = lshr i64 [[TMP2]], 5
+; CHECK-NEXT: store i64 [[SHR]], i64* [[P3]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ARRAYIDX4]] to <2 x i64>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = lshr <2 x i64> [[TMP4]], <i64 5, i64 5>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX4]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
+; CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[ARRAYIDX8]], align 8
+; CHECK-NEXT: [[SHR9:%.*]] = lshr i64 [[TMP7]], 5
+; CHECK-NEXT: store i64 [[SHR9]], i64* [[ARRAYIDX8]], align 8
+; CHECK-NEXT: [[ADD_PTR11:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[AND:%.*]] = and i64 [[SHR]], 5
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[ADD_PTR11]], i64 [[AND]]
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX13]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[P4]], align 4
+; CHECK-NEXT: [[ADD15:%.*]] = fadd float [[TMP8]], [[TMP9]]
+; CHECK-NEXT: store float [[ADD15]], float* [[P4]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ store i64 5, i64* %p3, align 8
+ %idx.ext = sext i32 %p2 to i64
+ %add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext
+ %arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5
+ %0 = load float, float* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %p4, i64 3
+ %1 = load float, float* %arrayidx2, align 4
+ %add = fadd float %0, %1
+ store float %add, float* %arrayidx2, align 4
+ %2 = load i64, i64* %p3, align 8
+ %shr = lshr i64 %2, 5
+ store i64 %shr, i64* %p3, align 8
+ %arrayidx4 = getelementptr inbounds i64, i64* %p3, i64 1
+ %3 = load i64, i64* %arrayidx4, align 8
+ %shr5 = lshr i64 %3, 5
+ store i64 %shr5, i64* %arrayidx4, align 8
+ %arrayidx6 = getelementptr inbounds i64, i64* %p3, i64 2
+ %4 = load i64, i64* %arrayidx6, align 8
+ %shr7 = lshr i64 %4, 5
+ store i64 %shr7, i64* %arrayidx6, align 8
+ %arrayidx8 = getelementptr inbounds i64, i64* %p3, i64 3
+ %5 = load i64, i64* %arrayidx8, align 8
+ %shr9 = lshr i64 %5, 5
+ store i64 %shr9, i64* %arrayidx8, align 8
+ %add.ptr11 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext
+ %and = and i64 %shr, 5
+ %arrayidx13 = getelementptr inbounds float, float* %add.ptr11, i64 %and
+ %6 = load float, float* %arrayidx13, align 4
+ %7 = load float, float* %p4, align 4
+ %add15 = fadd float %6, %7
+ store float %add15, float* %p4, align 4
+ ret void
+}
diff --git a/test/Transforms/SampleProfile/indirect-call.ll b/test/Transforms/SampleProfile/indirect-call.ll
index 61a1bc51996..0c00639e6c0 100644
--- a/test/Transforms/SampleProfile/indirect-call.ll
+++ b/test/Transforms/SampleProfile/indirect-call.ll
@@ -182,7 +182,7 @@ define void @test_direct() !dbg !22 {
; CHECK: ![[PROF]] = !{!"VP", i32 0, i64 3457, i64 9191153033785521275, i64 2059, i64 -1069303473483922844, i64 1398}
; CHECK: ![[BR1]] = !{!"branch_weights", i32 4000, i32 4000}
; CHECK: ![[BR2]] = !{!"branch_weights", i32 3000, i32 1000}
-; CHECK: ![[VP]] = !{!"VP", i32 0, i64 1000, i64 -6391416044382067764, i64 1000}
+; CHECK: ![[VP]] = !{!"VP", i32 0, i64 8000, i64 -6391416044382067764, i64 1000}
!6 = distinct !DISubprogram(name: "test_inline", scope: !1, file: !1, line: 6, unit: !0)
!7 = !DILocation(line: 7, scope: !6)
!8 = distinct !DISubprogram(name: "test_inline_strip", scope: !1, file: !1, line: 8, unit: !0)
diff --git a/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll b/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll
index a2b94038001..a2ca63d0a2d 100644
--- a/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll
+++ b/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S < %s -simplifycfg -simplifycfg-merge-cond-stores=true -simplifycfg-merge-cond-stores-aggressively=false -phi-node-folding-threshold=2 | FileCheck %s
+; RUN: opt -S < %s -simplifycfg -simplifycfg-merge-cond-stores=true -simplifycfg-merge-cond-stores-aggressively=false -phi-node-folding-threshold=1 | FileCheck %s
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "armv7--linux-gnueabihf"
diff --git a/test/Transforms/WholeProgramDevirt/import-indir.ll b/test/Transforms/WholeProgramDevirt/import-indir.ll
index 052a3494834..927ee16b370 100644
--- a/test/Transforms/WholeProgramDevirt/import-indir.ll
+++ b/test/Transforms/WholeProgramDevirt/import-indir.ll
@@ -7,6 +7,7 @@
; SUMMARY-NEXT: - Linkage: 0
; SUMMARY-NEXT: NotEligibleToImport: false
; SUMMARY-NEXT: Live: true
+; SUMMARY-NEXT: Local: false
; SUMMARY-NEXT: TypeTestAssumeVCalls:
; SUMMARY-NEXT: - GUID: 123
; SUMMARY-NEXT: Offset: 0
diff --git a/test/lit.cfg.py b/test/lit.cfg.py
index 6a5cf69b987..73a3b4b58a8 100644
--- a/test/lit.cfg.py
+++ b/test/lit.cfg.py
@@ -168,6 +168,10 @@ for arch in config.targets_to_build.split():
config.available_features.add(arch.lower() + '-registered-target')
# Features
+known_arches = ["x86_64", "mips64", "ppc64", "aarch64"]
+if (config.host_ldflags.find("-m32") < 0
+ and any(config.llvm_host_triple.startswith(x) for x in known_arches)):
+ config.available_features.add("llvm-64-bits")
# Others/can-execute.txt
if sys.platform not in ['win32']:
diff --git a/test/lit.site.cfg.py.in b/test/lit.site.cfg.py.in
index 19e5cd0d3c2..dff46dcff32 100644
--- a/test/lit.site.cfg.py.in
+++ b/test/lit.site.cfg.py.in
@@ -29,7 +29,6 @@ config.targets_to_build = "@TARGETS_TO_BUILD@"
config.native_target = "@LLVM_NATIVE_ARCH@"
config.llvm_bindings = "@LLVM_BINDINGS@".split(' ')
config.host_os = "@HOST_OS@"
-config.host_arch = "@HOST_ARCH@"
config.host_cc = "@HOST_CC@"
config.host_cxx = "@HOST_CXX@"
config.host_ldflags = "@HOST_LDFLAGS@"
@@ -42,6 +41,8 @@ config.enable_ffi = @LLVM_ENABLE_FFI@
config.build_shared_libs = @BUILD_SHARED_LIBS@
config.link_llvm_dylib = @LLVM_LINK_LLVM_DYLIB@
config.llvm_libxml2_enabled = "@LLVM_LIBXML2_ENABLED@"
+config.llvm_host_triple = '@LLVM_HOST_TRIPLE@'
+config.host_arch = "@HOST_ARCH@"
# Support substitution of the tools_dir with user parameters. This is
# used when we can't determine the tool dir at configuration time.
diff --git a/test/tools/dsymutil/cmdline.test b/test/tools/dsymutil/cmdline.test
index dea28cf3d90..f66858e9ae5 100644
--- a/test/tools/dsymutil/cmdline.test
+++ b/test/tools/dsymutil/cmdline.test
@@ -3,7 +3,7 @@ HELP: OVERVIEW: manipulate archived DWARF debug symbol files.
HELP: USAGE: llvm-dsymutil{{[^ ]*}} [options] <input files>
HELP-NOT: -reverse-iterate
HELP: Specific Options:
-HELP: -arch=<string>
+HELP: -arch=<arch>
HELP: -dump-debug-map
HELP: -flat
HELP: -no-odr
diff --git a/test/tools/gold/X86/asm_undefined2.ll b/test/tools/gold/X86/asm_undefined2.ll
index a170f45a55a..d6ed55a775a 100644
--- a/test/tools/gold/X86/asm_undefined2.ll
+++ b/test/tools/gold/X86/asm_undefined2.ll
@@ -9,10 +9,11 @@
; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold.so \
; RUN: --plugin-opt=save-temps \
; RUN: --plugin-opt=thinlto -o %t2 %t.o
-; RUN: llvm-dis < %t.o.5.precodegen.bc | FileCheck %s
+; RUN: llvm-dis < %t.o.5.precodegen.bc | FileCheck --check-prefix=CHECKTHIN %s
; Check that foo is not internalized
; CHECK: define void @foo
+; CHECKTHIN: define dso_local void @foo
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/tools/gold/X86/coff.ll b/test/tools/gold/X86/coff.ll
index 541383ddf51..e3eaa6a928c 100644
--- a/test/tools/gold/X86/coff.ll
+++ b/test/tools/gold/X86/coff.ll
@@ -11,7 +11,7 @@ define void @f() {
ret void
}
-; CHECK: define internal void @g() {
+; CHECK: define internal dso_local void @g() {
define hidden void @g() {
ret void
}
diff --git a/test/tools/gold/X86/common.ll b/test/tools/gold/X86/common.ll
index ca506f6dd2d..5d2c5157f69 100644
--- a/test/tools/gold/X86/common.ll
+++ b/test/tools/gold/X86/common.ll
@@ -46,4 +46,4 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; RUN: llvm-dis %t3.o -o - | FileCheck --check-prefix=MIXED %s
; Mixed ELF and IR. We keep ours as common so the linker will finish the merge.
-; MIXED: @a = common global i16 0, align 8
+; MIXED: @a = common dso_local global i16 0, align 8
diff --git a/test/tools/gold/X86/emit-llvm.ll b/test/tools/gold/X86/emit-llvm.ll
index 70d244c34ec..9aec93a78f0 100644
--- a/test/tools/gold/X86/emit-llvm.ll
+++ b/test/tools/gold/X86/emit-llvm.ll
@@ -48,14 +48,14 @@ target triple = "x86_64-unknown-linux-gnu"
@g8 = external global i32
-; CHECK-DAG: define internal void @f1()
+; CHECK-DAG: define internal dso_local void @f1()
; OPT2-NOT: @f1
define hidden void @f1() {
ret void
}
-; CHECK-DAG: define hidden void @f2()
-; OPT-DAG: define hidden void @f2()
+; CHECK-DAG: define dso_local hidden void @f2()
+; OPT-DAG: define dso_local hidden void @f2()
define hidden void @f2() {
ret void
}
diff --git a/test/tools/gold/X86/global_with_section.ll b/test/tools/gold/X86/global_with_section.ll
index 9023e76a4e6..c8291f8ceae 100644
--- a/test/tools/gold/X86/global_with_section.ll
+++ b/test/tools/gold/X86/global_with_section.ll
@@ -40,16 +40,16 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; We should not internalize @var_with_section due to section
-; CHECK-DAG: @var_with_section = global i32 0, section "some_section"
+; CHECK-DAG: @var_with_section = dso_local global i32 0, section "some_section"
@var_with_section = global i32 0, section "some_section"
; Confirm via a variable with a non-C identifier section that we are getting
; the expected internalization.
-; CHECK-DAG: @var_with_nonC_section = internal global i32 0, section ".nonCsection"
+; CHECK-DAG: @var_with_nonC_section = internal dso_local global i32 0, section ".nonCsection"
@var_with_nonC_section = global i32 0, section ".nonCsection"
; We should not internalize @deadfunc_with_section due to section
-; CHECK-DAG: define void @deadfunc_with_section() section "some_other_section"
+; CHECK-DAG: define dso_local void @deadfunc_with_section() section "some_other_section"
define void @deadfunc_with_section() section "some_other_section" {
call void @deadfunc2_called_from_section()
ret void
@@ -57,7 +57,7 @@ define void @deadfunc_with_section() section "some_other_section" {
; Confirm via a function with a non-C identifier section that we are getting
; the expected internalization.
-; CHECK-DAG: define internal void @deadfunc_with_nonC_section() section ".nonCsection"
+; CHECK-DAG: define internal dso_local void @deadfunc_with_nonC_section() section ".nonCsection"
define void @deadfunc_with_nonC_section() section ".nonCsection" {
call void @deadfunc2_called_from_nonC_section()
ret void
@@ -65,15 +65,15 @@ define void @deadfunc_with_nonC_section() section ".nonCsection" {
; In RegularLTO mode, where we have combined all the IR,
; @deadfunc2_called_from_section can be internalized.
-; CHECK2-REGULARLTO: define internal void @deadfunc2_called_from_section
+; CHECK2-REGULARLTO: define internal dso_local void @deadfunc2_called_from_section
; In ThinLTO mode, we can't internalize it as it needs to be preserved
; (due to the access from @deadfunc_with_section which must be preserved), and
; can't be internalized since the reference is from a different module.
-; CHECK2-THINLTO: define void @deadfunc2_called_from_section
+; CHECK2-THINLTO: define dso_local void @deadfunc2_called_from_section
declare void @deadfunc2_called_from_section()
; Confirm when called from a function with a non-C identifier section that we
; are getting the expected internalization.
-; CHECK2-REGULARLTO: define internal void @deadfunc2_called_from_nonC_section
-; CHECK2-THINLTO: define internal void @deadfunc2_called_from_nonC_section
+; CHECK2-REGULARLTO: define internal dso_local void @deadfunc2_called_from_nonC_section
+; CHECK2-THINLTO: define internal dso_local void @deadfunc2_called_from_nonC_section
declare void @deadfunc2_called_from_nonC_section()
diff --git a/test/tools/gold/X86/parallel.ll b/test/tools/gold/X86/parallel.ll
index 4de694c94c8..7d0e405d5d6 100644
--- a/test/tools/gold/X86/parallel.ll
+++ b/test/tools/gold/X86/parallel.ll
@@ -9,8 +9,8 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-; CHECK-BC0: define void @foo
-; CHECK-BC0: declare void @bar
+; CHECK-BC0: define dso_local void @foo
+; CHECK-BC0: declare dso_local void @bar
; CHECK0-NOT: bar
; CHECK0: T foo
; CHECK0-NOT: bar
@@ -19,8 +19,8 @@ define void @foo() {
ret void
}
-; CHECK-BC1: declare void @foo
-; CHECK-BC1: define void @bar
+; CHECK-BC1: declare dso_local void @foo
+; CHECK-BC1: define dso_local void @bar
; CHECK1-NOT: foo
; CHECK1: T bar
; CHECK1-NOT: foo
diff --git a/test/tools/gold/X86/thinlto_linkonceresolution.ll b/test/tools/gold/X86/thinlto_linkonceresolution.ll
index bf2d22a9ef7..c56d6ce2857 100644
--- a/test/tools/gold/X86/thinlto_linkonceresolution.ll
+++ b/test/tools/gold/X86/thinlto_linkonceresolution.ll
@@ -21,7 +21,7 @@
; confirm the weak linkage directly in the saved opt bitcode files.
; CHECK-NOT: U f
; OPT-NOT: @f()
-; OPT2: define weak_odr hidden void @f()
+; OPT2: define weak_odr dso_local hidden void @f()
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/tools/gold/X86/thinlto_weak_library.ll b/test/tools/gold/X86/thinlto_weak_library.ll
index 6a04fc0db0e..9e7b4794c65 100644
--- a/test/tools/gold/X86/thinlto_weak_library.ll
+++ b/test/tools/gold/X86/thinlto_weak_library.ll
@@ -24,7 +24,7 @@
; copy of f() (and didn't simply convert to available_externally, which
; would incorrectly enable inlining).
; RUN: llvm-dis %t2.o.1.promote.bc -o - | FileCheck %s
-; CHECK: declare i32 @f()
+; CHECK: declare dso_local i32 @f()
; ModuleID = 'thinlto_weak_library.c'
source_filename = "thinlto_weak_library.c"
diff --git a/test/tools/gold/X86/visibility.ll b/test/tools/gold/X86/visibility.ll
index 1c70ebf5c46..61f565d2da4 100644
--- a/test/tools/gold/X86/visibility.ll
+++ b/test/tools/gold/X86/visibility.ll
@@ -17,7 +17,7 @@
; CHECK-NEXT: STV_PROTECTED
; CHECK-NEXT: ]
-; IR: define void @foo
+; IR: define dso_local void @foo
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/tools/llvm-ar/default-add.test b/test/tools/llvm-ar/default-add.test
index 88719e4efce..68e41c24910 100644
--- a/test/tools/llvm-ar/default-add.test
+++ b/test/tools/llvm-ar/default-add.test
@@ -4,7 +4,8 @@ RUN: yaml2obj %S/Inputs/coff.yaml -o %t-coff.o
RUN: rm -f %t.ar
RUN: llvm-ar crs %t.ar %t-macho.o
RUN: grep -q __.SYMDEF %t.ar
-RUN: llvm-ar crs %t.ar %t-coff.o
+Test that an option string prefixed by a dash works.
+RUN: llvm-ar -crs %t.ar %t-coff.o
RUN: grep -q __.SYMDEF %t.ar
RUN: rm -f %t.ar
diff --git a/test/tools/llvm-cfi-verify/X86/Inputs/protected-lineinfo.s b/test/tools/llvm-cfi-verify/X86/Inputs/protected-lineinfo.s
new file mode 100644
index 00000000000..f8cfcb8d15c
--- /dev/null
+++ b/test/tools/llvm-cfi-verify/X86/Inputs/protected-lineinfo.s
@@ -0,0 +1,195 @@
+# Source (tiny.cc):
+# void a() {}
+# void b() {}
+# int main(int argc, char** argv) {
+# void(*ptr)();
+# if (argc == 1)
+# ptr = &a;
+# else
+# ptr = &b;
+# ptr();
+# }
+# Compile with (output is in tiny.s.0):
+# clang++ -flto -fsanitize=cfi -fvisibility=hidden -c tiny.cc -o tiny.o -gmlt
+# clang++ tiny.o -o tiny -flto -fuse-ld=gold -Wl,-plugin-opt,save-temps
+# clang++ -fsanitize=cfi -flto -fvisibility=hidden -c tiny.cc -o tiny.o -gmlt
+# llvm-lto2 run @tiny.resolution.txt -o tiny.s -filetype=asm
+
+ .text
+ .file "ld-temp.o"
+ .p2align 4, 0x90
+ .type _Z1av.cfi,@function
+_Z1av.cfi:
+.Lfunc_begin0:
+ .file 1 "tiny.cc"
+ .loc 1 1 0
+ .cfi_startproc
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+.Ltmp0:
+ .loc 1 1 11 prologue_end
+ popq %rbp
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size _Z1av.cfi, .Lfunc_end0-_Z1av.cfi
+ .cfi_endproc
+
+ .p2align 4, 0x90
+ .type _Z1bv.cfi,@function
+_Z1bv.cfi:
+.Lfunc_begin1:
+ .loc 1 2 0
+ .cfi_startproc
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+.Ltmp2:
+ .loc 1 2 11 prologue_end
+ popq %rbp
+ retq
+.Ltmp3:
+.Lfunc_end1:
+ .size _Z1bv.cfi, .Lfunc_end1-_Z1bv.cfi
+ .cfi_endproc
+
+ .hidden main
+ .globl main
+ .p2align 4, 0x90
+ .type main,@function
+main:
+.Lfunc_begin2:
+ .loc 1 4 0
+ .cfi_startproc
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ subq $32, %rsp
+ movl $0, -8(%rbp)
+ movl %edi, -4(%rbp)
+ movq %rsi, -24(%rbp)
+.Ltmp4:
+ .loc 1 6 12 prologue_end
+ cmpl $1, -4(%rbp)
+ .loc 1 6 7 is_stmt 0
+ jne .LBB2_2
+ .loc 1 0 7
+ leaq _Z1av(%rip), %rax
+ .loc 1 7 9 is_stmt 1
+ movq %rax, -16(%rbp)
+ .loc 1 7 5 is_stmt 0
+ jmp .LBB2_3
+.LBB2_2:
+ .loc 1 0 5
+ leaq _Z1bv(%rip), %rax
+ .loc 1 9 9 is_stmt 1
+ movq %rax, -16(%rbp)
+.LBB2_3:
+ .loc 1 0 9 is_stmt 0
+ leaq .L.cfi.jumptable(%rip), %rcx
+ .loc 1 11 3 is_stmt 1
+ movq -16(%rbp), %rax
+ movq %rax, %rdx
+ subq %rcx, %rdx
+ movq %rdx, %rcx
+ shrq $3, %rcx
+ shlq $61, %rdx
+ orq %rcx, %rdx
+ cmpq $1, %rdx
+ jbe .LBB2_5
+ ud2
+.LBB2_5:
+ callq *%rax
+ .loc 1 12 1
+ movl -8(%rbp), %eax
+ addq $32, %rsp
+ popq %rbp
+ retq
+.Ltmp5:
+.Lfunc_end2:
+ .size main, .Lfunc_end2-main
+ .cfi_endproc
+
+ .p2align 3, 0x90
+ .type .L.cfi.jumptable,@function
+.L.cfi.jumptable:
+.Lfunc_begin3:
+ .cfi_startproc
+ #APP
+ jmp _Z1av.cfi@PLT
+ int3
+ int3
+ int3
+ jmp _Z1bv.cfi@PLT
+ int3
+ int3
+ int3
+
+ #NO_APP
+.Lfunc_end3:
+ .size .L.cfi.jumptable, .Lfunc_end3-.L.cfi.jumptable
+ .cfi_endproc
+
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 6.0.0 (trunk 316774)"
+.Linfo_string1:
+ .asciz "tiny.cc"
+.Linfo_string2:
+ .asciz ""
+ .section .debug_abbrev,"",@progbits
+ .byte 1
+ .byte 17
+ .byte 0
+ .byte 37
+ .byte 14
+ .byte 19
+ .byte 5
+ .byte 3
+ .byte 14
+ .byte 16
+ .byte 23
+ .byte 27
+ .byte 14
+ .byte 17
+ .byte 1
+ .byte 18
+ .byte 6
+ .byte 0
+ .byte 0
+ .byte 0
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long 38
+ .short 4
+ .long .debug_abbrev
+ .byte 8
+ .byte 1
+ .long .Linfo_string0
+ .short 4
+ .long .Linfo_string1
+ .long .Lline_table_start0
+ .long .Linfo_string2
+ .quad .Lfunc_begin0
+ .long .Lfunc_end2-.Lfunc_begin0
+ .section .debug_ranges,"",@progbits
+ .section .debug_macinfo,"",@progbits
+.Lcu_macro_begin0:
+ .byte 0
+
+ .type _Z1av,@function
+_Z1av = .L.cfi.jumptable
+ .type _Z1bv,@function
+_Z1bv = .L.cfi.jumptable+8
+ .ident "clang version 6.0.0 (trunk 316774)"
+ .section ".note.GNU-stack","",@progbits
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
+
diff --git a/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-fullinfo.s b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-fullinfo.s
new file mode 100644
index 00000000000..7b5ca07d7e4
--- /dev/null
+++ b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-fullinfo.s
@@ -0,0 +1,380 @@
+# Source (tiny.cc):
+# void a() {}
+# void b() {}
+# int main(int argc, char** argv) {
+# void(*ptr)();
+# if (argc == 1)
+# ptr = &a;
+# else
+# ptr = &b;
+# ptr();
+# }
+# Compile with:
+# clang++ -g tiny.cc -S -o tiny.s
+
+ .text
+ .file "tiny.cc"
+ .globl _Z1av # -- Begin function _Z1av
+ .p2align 4, 0x90
+ .type _Z1av,@function
+_Z1av: # @_Z1av
+.Lfunc_begin0:
+ .file 1 "tiny.cc"
+ .loc 1 1 0 # tiny.cc:1:0
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+.Ltmp0:
+ .loc 1 1 11 prologue_end # tiny.cc:1:11
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size _Z1av, .Lfunc_end0-_Z1av
+ .cfi_endproc
+ # -- End function
+ .globl _Z1bv # -- Begin function _Z1bv
+ .p2align 4, 0x90
+ .type _Z1bv,@function
+_Z1bv: # @_Z1bv
+.Lfunc_begin1:
+ .loc 1 2 0 # tiny.cc:2:0
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+.Ltmp2:
+ .loc 1 2 11 prologue_end # tiny.cc:2:11
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp3:
+.Lfunc_end1:
+ .size _Z1bv, .Lfunc_end1-_Z1bv
+ .cfi_endproc
+ # -- End function
+ .globl main # -- Begin function main
+ .p2align 4, 0x90
+ .type main,@function
+main: # @main
+.Lfunc_begin2:
+ .loc 1 4 0 # tiny.cc:4:0
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ subq $32, %rsp
+ movl $0, -4(%rbp)
+ movl %edi, -8(%rbp)
+ movq %rsi, -16(%rbp)
+.Ltmp4:
+ .loc 1 6 12 prologue_end # tiny.cc:6:12
+ cmpl $1, -8(%rbp)
+.Ltmp5:
+ .loc 1 6 7 is_stmt 0 # tiny.cc:6:7
+ jne .LBB2_2
+# BB#1:
+ .loc 1 0 7 # tiny.cc:0:7
+ movabsq $_Z1av, %rax
+.Ltmp6:
+ .loc 1 7 9 is_stmt 1 # tiny.cc:7:9
+ movq %rax, -24(%rbp)
+ .loc 1 7 5 is_stmt 0 # tiny.cc:7:5
+ jmp .LBB2_3
+.LBB2_2:
+ .loc 1 0 5 # tiny.cc:0:5
+ movabsq $_Z1bv, %rax
+ .loc 1 9 9 is_stmt 1 # tiny.cc:9:9
+ movq %rax, -24(%rbp)
+.Ltmp7:
+.LBB2_3:
+ .loc 1 11 3 # tiny.cc:11:3
+ callq *-24(%rbp)
+ .loc 1 12 1 # tiny.cc:12:1
+ movl -4(%rbp), %eax
+ addq $32, %rsp
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp8:
+.Lfunc_end2:
+ .size main, .Lfunc_end2-main
+ .cfi_endproc
+ # -- End function
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 6.0.0 (trunk 317104)" # string offset=0
+.Linfo_string1:
+ .asciz "tiny.cc" # string offset=35
+.Linfo_string2:
+ .asciz "/tmp/a/b" # string offset=43
+.Linfo_string3:
+ .asciz "_Z1av" # string offset=52
+.Linfo_string4:
+ .asciz "a" # string offset=58
+.Linfo_string5:
+ .asciz "_Z1bv" # string offset=60
+.Linfo_string6:
+ .asciz "b" # string offset=66
+.Linfo_string7:
+ .asciz "main" # string offset=68
+.Linfo_string8:
+ .asciz "int" # string offset=73
+.Linfo_string9:
+ .asciz "argc" # string offset=77
+.Linfo_string10:
+ .asciz "argv" # string offset=82
+.Linfo_string11:
+ .asciz "char" # string offset=87
+.Linfo_string12:
+ .asciz "ptr" # string offset=92
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 14 # DW_FORM_strp
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 14 # DW_FORM_strp
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 14 # DW_FORM_strp
+ .ascii "\264B" # DW_AT_GNU_pubnames
+ .byte 25 # DW_FORM_flag_present
+ .byte 17 # DW_AT_low_pc
+ .byte 1 # DW_FORM_addr
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 0 # DW_CHILDREN_no
+ .byte 17 # DW_AT_low_pc
+ .byte 1 # DW_FORM_addr
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 110 # DW_AT_linkage_name
+ .byte 14 # DW_FORM_strp
+ .byte 3 # DW_AT_name
+ .byte 14 # DW_FORM_strp
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 1 # DW_FORM_addr
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 14 # DW_FORM_strp
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 14 # DW_FORM_strp
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 14 # DW_FORM_strp
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 14 # DW_FORM_strp
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 15 # DW_TAG_pointer_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 21 # DW_TAG_subroutine_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long 187 # Length of Unit
+ .short 4 # DWARF version number
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 8 # Address Size (in bytes)
+ .byte 1 # Abbrev [1] 0xb:0xb4 DW_TAG_compile_unit
+ .long .Linfo_string0 # DW_AT_producer
+ .short 4 # DW_AT_language
+ .long .Linfo_string1 # DW_AT_name
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Linfo_string2 # DW_AT_comp_dir
+ # DW_AT_GNU_pubnames
+ .quad .Lfunc_begin0 # DW_AT_low_pc
+ .long .Lfunc_end2-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 2 # Abbrev [2] 0x2a:0x19 DW_TAG_subprogram
+ .quad .Lfunc_begin0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .long .Linfo_string3 # DW_AT_linkage_name
+ .long .Linfo_string4 # DW_AT_name
+ .byte 1 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ # DW_AT_external
+ .byte 2 # Abbrev [2] 0x43:0x19 DW_TAG_subprogram
+ .quad .Lfunc_begin1 # DW_AT_low_pc
+ .long .Lfunc_end1-.Lfunc_begin1 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .long .Linfo_string5 # DW_AT_linkage_name
+ .long .Linfo_string6 # DW_AT_name
+ .byte 1 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ # DW_AT_external
+ .byte 3 # Abbrev [3] 0x5c:0x44 DW_TAG_subprogram
+ .quad .Lfunc_begin2 # DW_AT_low_pc
+ .long .Lfunc_end2-.Lfunc_begin2 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .long .Linfo_string7 # DW_AT_name
+ .byte 1 # DW_AT_decl_file
+ .byte 4 # DW_AT_decl_line
+ .long 160 # DW_AT_type
+ # DW_AT_external
+ .byte 4 # Abbrev [4] 0x75:0xe DW_TAG_formal_parameter
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 120
+ .long .Linfo_string9 # DW_AT_name
+ .byte 1 # DW_AT_decl_file
+ .byte 4 # DW_AT_decl_line
+ .long 160 # DW_AT_type
+ .byte 4 # Abbrev [4] 0x83:0xe DW_TAG_formal_parameter
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 112
+ .long .Linfo_string10 # DW_AT_name
+ .byte 1 # DW_AT_decl_file
+ .byte 4 # DW_AT_decl_line
+ .long 167 # DW_AT_type
+ .byte 5 # Abbrev [5] 0x91:0xe DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 104
+ .long .Linfo_string12 # DW_AT_name
+ .byte 1 # DW_AT_decl_file
+ .byte 5 # DW_AT_decl_line
+ .long 184 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 6 # Abbrev [6] 0xa0:0x7 DW_TAG_base_type
+ .long .Linfo_string8 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 7 # Abbrev [7] 0xa7:0x5 DW_TAG_pointer_type
+ .long 172 # DW_AT_type
+ .byte 7 # Abbrev [7] 0xac:0x5 DW_TAG_pointer_type
+ .long 177 # DW_AT_type
+ .byte 6 # Abbrev [6] 0xb1:0x7 DW_TAG_base_type
+ .long .Linfo_string11 # DW_AT_name
+ .byte 6 # DW_AT_encoding
+ .byte 1 # DW_AT_byte_size
+ .byte 7 # Abbrev [7] 0xb8:0x5 DW_TAG_pointer_type
+ .long 189 # DW_AT_type
+ .byte 8 # Abbrev [8] 0xbd:0x1 DW_TAG_subroutine_type
+ .byte 0 # End Of Children Mark
+ .section .debug_ranges,"",@progbits
+ .section .debug_macinfo,"",@progbits
+.Lcu_macro_begin0:
+ .byte 0 # End Of Macro List Mark
+ .section .debug_pubnames,"",@progbits
+ .long .LpubNames_end0-.LpubNames_begin0 # Length of Public Names Info
+.LpubNames_begin0:
+ .short 2 # DWARF Version
+ .long .Lcu_begin0 # Offset of Compilation Unit Info
+ .long 191 # Compilation Unit Length
+ .long 42 # DIE offset
+ .asciz "a" # External Name
+ .long 67 # DIE offset
+ .asciz "b" # External Name
+ .long 92 # DIE offset
+ .asciz "main" # External Name
+ .long 0 # End Mark
+.LpubNames_end0:
+ .section .debug_pubtypes,"",@progbits
+ .long .LpubTypes_end0-.LpubTypes_begin0 # Length of Public Types Info
+.LpubTypes_begin0:
+ .short 2 # DWARF Version
+ .long .Lcu_begin0 # Offset of Compilation Unit Info
+ .long 191 # Compilation Unit Length
+ .long 160 # DIE offset
+ .asciz "int" # External Name
+ .long 177 # DIE offset
+ .asciz "char" # External Name
+ .long 0 # End Mark
+.LpubTypes_end0:
+
+ .ident "clang version 6.0.0 (trunk 317104)"
+ .section ".note.GNU-stack","",@progbits
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-lineinfo.s b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-lineinfo.s
new file mode 100644
index 00000000000..155f5978b46
--- /dev/null
+++ b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-lineinfo.s
@@ -0,0 +1,159 @@
+# Source (tiny.cc):
+# void a() {}
+# void b() {}
+# int main(int argc, char** argv) {
+# void(*ptr)();
+# if (argc == 1)
+# ptr = &a;
+# else
+# ptr = &b;
+# ptr();
+# }
+# Compile with:
+# clang++ -gmlt tiny.cc -S -o tiny.s
+
+ .text
+ .file "tiny.cc"
+ .globl _Z1av # -- Begin function _Z1av
+ .p2align 4, 0x90
+ .type _Z1av,@function
+_Z1av: # @_Z1av
+.Lfunc_begin0:
+ .file 1 "tiny.cc"
+ .loc 1 1 0 # tiny.cc:1:0
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+.Ltmp0:
+ .loc 1 1 11 prologue_end # tiny.cc:1:11
+ popq %rbp
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size _Z1av, .Lfunc_end0-_Z1av
+ .cfi_endproc
+ # -- End function
+ .globl _Z1bv # -- Begin function _Z1bv
+ .p2align 4, 0x90
+ .type _Z1bv,@function
+_Z1bv: # @_Z1bv
+.Lfunc_begin1:
+ .loc 1 2 0 # tiny.cc:2:0
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+.Ltmp2:
+ .loc 1 2 11 prologue_end # tiny.cc:2:11
+ popq %rbp
+ retq
+.Ltmp3:
+.Lfunc_end1:
+ .size _Z1bv, .Lfunc_end1-_Z1bv
+ .cfi_endproc
+ # -- End function
+ .globl main # -- Begin function main
+ .p2align 4, 0x90
+ .type main,@function
+main: # @main
+.Lfunc_begin2:
+ .loc 1 4 0 # tiny.cc:4:0
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ subq $32, %rsp
+ movl $0, -4(%rbp)
+ movl %edi, -8(%rbp)
+ movq %rsi, -16(%rbp)
+.Ltmp4:
+ .loc 1 6 12 prologue_end # tiny.cc:6:12
+ cmpl $1, -8(%rbp)
+ .loc 1 6 7 is_stmt 0 # tiny.cc:6:7
+ jne .LBB2_2
+# BB#1:
+ .loc 1 0 7 # tiny.cc:0:7
+ movabsq $_Z1av, %rax
+ .loc 1 7 9 is_stmt 1 # tiny.cc:7:9
+ movq %rax, -24(%rbp)
+ .loc 1 7 5 is_stmt 0 # tiny.cc:7:5
+ jmp .LBB2_3
+.LBB2_2:
+ .loc 1 0 5 # tiny.cc:0:5
+ movabsq $_Z1bv, %rax
+ .loc 1 9 9 is_stmt 1 # tiny.cc:9:9
+ movq %rax, -24(%rbp)
+.LBB2_3:
+ .loc 1 11 3 # tiny.cc:11:3
+ callq *-24(%rbp)
+ .loc 1 12 1 # tiny.cc:12:1
+ movl -4(%rbp), %eax
+ addq $32, %rsp
+ popq %rbp
+ retq
+.Ltmp5:
+.Lfunc_end2:
+ .size main, .Lfunc_end2-main
+ .cfi_endproc
+ # -- End function
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 6.0.0 (trunk 316774)" # string offset=0
+.Linfo_string1:
+ .asciz "tiny.cc" # string offset=35
+.Linfo_string2:
+ .asciz "/tmp/a/b" # string offset=43
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 0 # DW_CHILDREN_no
+ .byte 37 # DW_AT_producer
+ .byte 14 # DW_FORM_strp
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 14 # DW_FORM_strp
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 14 # DW_FORM_strp
+ .byte 17 # DW_AT_low_pc
+ .byte 1 # DW_FORM_addr
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long 38 # Length of Unit
+ .short 4 # DWARF version number
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 8 # Address Size (in bytes)
+ .byte 1 # Abbrev [1] 0xb:0x1f DW_TAG_compile_unit
+ .long .Linfo_string0 # DW_AT_producer
+ .short 4 # DW_AT_language
+ .long .Linfo_string1 # DW_AT_name
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Linfo_string2 # DW_AT_comp_dir
+ .quad .Lfunc_begin0 # DW_AT_low_pc
+ .long .Lfunc_end2-.Lfunc_begin0 # DW_AT_high_pc
+ .section .debug_ranges,"",@progbits
+ .section .debug_macinfo,"",@progbits
+.Lcu_macro_begin0:
+ .byte 0 # End Of Macro List Mark
+
+ .ident "clang version 6.0.0 (trunk 316774)"
+ .section ".note.GNU-stack","",@progbits
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-nolineinfo.s b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-nolineinfo.s
new file mode 100644
index 00000000000..2d3cf2f484e
--- /dev/null
+++ b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-nolineinfo.s
@@ -0,0 +1,87 @@
+# Source (tiny.cc):
+# void a() {}
+# void b() {}
+# int main(int argc, char** argv) {
+# void(*ptr)();
+# if (argc == 1)
+# ptr = &a;
+# else
+# ptr = &b;
+# ptr();
+# }
+# Compile with:
+# clang++ tiny.cc -S -o tiny.s
+
+ .text
+ .file "tiny.cc"
+ .globl _Z1av # -- Begin function _Z1av
+ .p2align 4, 0x90
+ .type _Z1av,@function
+_Z1av: # @_Z1av
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ popq %rbp
+ retq
+.Lfunc_end0:
+ .size _Z1av, .Lfunc_end0-_Z1av
+ .cfi_endproc
+ # -- End function
+ .globl _Z1bv # -- Begin function _Z1bv
+ .p2align 4, 0x90
+ .type _Z1bv,@function
+_Z1bv: # @_Z1bv
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ popq %rbp
+ retq
+.Lfunc_end1:
+ .size _Z1bv, .Lfunc_end1-_Z1bv
+ .cfi_endproc
+ # -- End function
+ .globl main # -- Begin function main
+ .p2align 4, 0x90
+ .type main,@function
+main: # @main
+ .cfi_startproc
+# BB#0:
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ subq $32, %rsp
+ movl $0, -4(%rbp)
+ movl %edi, -8(%rbp)
+ movq %rsi, -16(%rbp)
+ cmpl $1, -8(%rbp)
+ jne .LBB2_2
+# BB#1:
+ movabsq $_Z1av, %rax
+ movq %rax, -24(%rbp)
+ jmp .LBB2_3
+.LBB2_2:
+ movabsq $_Z1bv, %rax
+ movq %rax, -24(%rbp)
+.LBB2_3:
+ callq *-24(%rbp)
+ movl -4(%rbp), %eax
+ addq $32, %rsp
+ popq %rbp
+ retq
+.Lfunc_end2:
+ .size main, .Lfunc_end2-main
+ .cfi_endproc
+ # -- End function
+
+ .ident "clang version 6.0.0 (trunk 316774)"
+ .section ".note.GNU-stack","",@progbits
diff --git a/test/tools/llvm-cfi-verify/X86/blacklist-expected-unprotected.s b/test/tools/llvm-cfi-verify/X86/blacklist-expected-unprotected.s
new file mode 100644
index 00000000000..fbcfcc2a7cc
--- /dev/null
+++ b/test/tools/llvm-cfi-verify/X86/blacklist-expected-unprotected.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc %S/Inputs/unprotected-lineinfo.s -filetype obj \
+# RUN: -triple x86_64-linux-elf -o %t.o
+# RUN: echo "src:*tiny*" > %t.blacklist.txt
+# RUN: llvm-cfi-verify %t.o %t.blacklist.txt | FileCheck %s
+
+# CHECK-LABEL: U
+# CHECK-NEXT: tiny.cc:11
+# CHECK-NEXT: BLACKLIST MATCH, 'src'
+# CHECK-NEXT: ====> Expected Unprotected
+
+# CHECK: Expected Protected: 0 (0.00%)
+# CHECK: Unexpected Protected: 0 (0.00%)
+# CHECK: Expected Unprotected: 1 (100.00%)
+# CHECK: Unexpected Unprotected (BAD): 0 (0.00%)
+
+# Source: (blacklist.txt):
+# src:*tiny*
diff --git a/test/tools/llvm-cfi-verify/X86/blacklist-match-fun.s b/test/tools/llvm-cfi-verify/X86/blacklist-match-fun.s
new file mode 100644
index 00000000000..3ea829395c4
--- /dev/null
+++ b/test/tools/llvm-cfi-verify/X86/blacklist-match-fun.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc %S/Inputs/unprotected-fullinfo.s -filetype obj \
+# RUN: -triple x86_64-linux-elf -o %t.o
+# RUN: echo "fun:*main*" > %t.blacklist.txt
+# RUN: llvm-cfi-verify %t.o %t.blacklist.txt | FileCheck %s
+
+# CHECK-LABEL: U
+# CHECK-NEXT: tiny.cc:11
+# CHECK-NEXT: BLACKLIST MATCH, 'fun'
+# CHECK-NEXT: ====> Expected Unprotected
+
+# CHECK: Expected Protected: 0 (0.00%)
+# CHECK: Unexpected Protected: 0 (0.00%)
+# CHECK: Expected Unprotected: 1 (100.00%)
+# CHECK: Unexpected Unprotected (BAD): 0 (0.00%)
+
+# Source: (blacklist.txt):
+# fun:*main*
diff --git a/test/tools/llvm-cfi-verify/X86/blacklist-unexpected-protected.s b/test/tools/llvm-cfi-verify/X86/blacklist-unexpected-protected.s
new file mode 100644
index 00000000000..c6ddf2b5d11
--- /dev/null
+++ b/test/tools/llvm-cfi-verify/X86/blacklist-unexpected-protected.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc %S/Inputs/protected-lineinfo.s -filetype obj \
+# RUN: -triple x86_64-linux-elf -o %t.o
+# RUN: echo "src:*tiny*" > %t.blacklist.txt
+# RUN: llvm-cfi-verify %t.o %t.blacklist.txt | FileCheck %s
+
+# CHECK-LABEL: P
+# CHECK-NEXT: tiny.cc:11
+# CHECK-NEXT: BLACKLIST MATCH, 'src'
+# CHECK-NEXT: ====> Unexpected Protected
+
+# CHECK: Expected Protected: 0 (0.00%)
+# CHECK: Unexpected Protected: 1 (100.00%)
+# CHECK: Expected Unprotected: 0 (0.00%)
+# CHECK: Unexpected Unprotected (BAD): 0 (0.00%)
+
+# Source: (blacklist.txt):
+# src:*tiny*
diff --git a/test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s b/test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s
index bf1d87a2eb8..e9b873471cb 100644
--- a/test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s
+++ b/test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s
@@ -10,7 +10,10 @@
# reporting of the cfi-verify program. It should only find a single indirect CF
# instruction at `tiny.cc:11` (see protected-lineinfo.s for the source).
-# CHECK: Unprotected: 0 (0.00%), Protected: 1 (100.00%)
+# CHECK: Expected Protected: 1 (100.00%)
+# CHECK: Unexpected Protected: 0 (0.00%)
+# CHECK: Expected Unprotected: 0 (0.00%)
+# CHECK: Unexpected Unprotected (BAD): 0 (0.00%)
.text
.file "ld-temp.o"
diff --git a/test/tools/llvm-cfi-verify/X86/protected-lineinfo.s b/test/tools/llvm-cfi-verify/X86/protected-lineinfo.s
index e3bb0f7af46..8eaf2e5e725 100644
--- a/test/tools/llvm-cfi-verify/X86/protected-lineinfo.s
+++ b/test/tools/llvm-cfi-verify/X86/protected-lineinfo.s
@@ -1,203 +1,11 @@
-# RUN: llvm-mc %s -filetype obj -triple x86_64-linux-elf -o %t.o
+# RUN: llvm-mc %S/Inputs/protected-lineinfo.s -filetype obj \
+# RUN: -triple x86_64-linux-elf -o %t.o
# RUN: llvm-cfi-verify %t.o | FileCheck %s
# CHECK-LABEL: P
# CHECK-NEXT: tiny.cc:11
-# CHECK: Unprotected: 0 (0.00%), Protected: 1 (100.00%)
-
-# Source (tiny.cc):
-# void a() {}
-# void b() {}
-# int main(int argc, char** argv) {
-# void(*ptr)();
-# if (argc == 1)
-# ptr = &a;
-# else
-# ptr = &b;
-# ptr();
-# }
-# Compile with (output is in tiny.s.0):
-# clang++ -flto -fsanitize=cfi -fvisibility=hidden -c tiny.cc -o tiny.o -gmlt
-# clang++ tiny.o -o tiny -flto -fuse-ld=gold -Wl,-plugin-opt,save-temps
-# clang++ -fsanitize=cfi -flto -fvisibility=hidden -c tiny.cc -o tiny.o -gmlt
-# llvm-lto2 run @tiny.resolution.txt -o tiny.s -filetype=asm
-
- .text
- .file "ld-temp.o"
- .p2align 4, 0x90
- .type _Z1av.cfi,@function
-_Z1av.cfi:
-.Lfunc_begin0:
- .file 1 "tiny.cc"
- .loc 1 1 0
- .cfi_startproc
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
-.Ltmp0:
- .loc 1 1 11 prologue_end
- popq %rbp
- retq
-.Ltmp1:
-.Lfunc_end0:
- .size _Z1av.cfi, .Lfunc_end0-_Z1av.cfi
- .cfi_endproc
-
- .p2align 4, 0x90
- .type _Z1bv.cfi,@function
-_Z1bv.cfi:
-.Lfunc_begin1:
- .loc 1 2 0
- .cfi_startproc
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
-.Ltmp2:
- .loc 1 2 11 prologue_end
- popq %rbp
- retq
-.Ltmp3:
-.Lfunc_end1:
- .size _Z1bv.cfi, .Lfunc_end1-_Z1bv.cfi
- .cfi_endproc
-
- .hidden main
- .globl main
- .p2align 4, 0x90
- .type main,@function
-main:
-.Lfunc_begin2:
- .loc 1 4 0
- .cfi_startproc
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
- subq $32, %rsp
- movl $0, -8(%rbp)
- movl %edi, -4(%rbp)
- movq %rsi, -24(%rbp)
-.Ltmp4:
- .loc 1 6 12 prologue_end
- cmpl $1, -4(%rbp)
- .loc 1 6 7 is_stmt 0
- jne .LBB2_2
- .loc 1 0 7
- leaq _Z1av(%rip), %rax
- .loc 1 7 9 is_stmt 1
- movq %rax, -16(%rbp)
- .loc 1 7 5 is_stmt 0
- jmp .LBB2_3
-.LBB2_2:
- .loc 1 0 5
- leaq _Z1bv(%rip), %rax
- .loc 1 9 9 is_stmt 1
- movq %rax, -16(%rbp)
-.LBB2_3:
- .loc 1 0 9 is_stmt 0
- leaq .L.cfi.jumptable(%rip), %rcx
- .loc 1 11 3 is_stmt 1
- movq -16(%rbp), %rax
- movq %rax, %rdx
- subq %rcx, %rdx
- movq %rdx, %rcx
- shrq $3, %rcx
- shlq $61, %rdx
- orq %rcx, %rdx
- cmpq $1, %rdx
- jbe .LBB2_5
- ud2
-.LBB2_5:
- callq *%rax
- .loc 1 12 1
- movl -8(%rbp), %eax
- addq $32, %rsp
- popq %rbp
- retq
-.Ltmp5:
-.Lfunc_end2:
- .size main, .Lfunc_end2-main
- .cfi_endproc
-
- .p2align 3, 0x90
- .type .L.cfi.jumptable,@function
-.L.cfi.jumptable:
-.Lfunc_begin3:
- .cfi_startproc
- #APP
- jmp _Z1av.cfi@PLT
- int3
- int3
- int3
- jmp _Z1bv.cfi@PLT
- int3
- int3
- int3
-
- #NO_APP
-.Lfunc_end3:
- .size .L.cfi.jumptable, .Lfunc_end3-.L.cfi.jumptable
- .cfi_endproc
-
- .section .debug_str,"MS",@progbits,1
-.Linfo_string0:
- .asciz "clang version 6.0.0 (trunk 316774)"
-.Linfo_string1:
- .asciz "tiny.cc"
-.Linfo_string2:
- .asciz ""
- .section .debug_abbrev,"",@progbits
- .byte 1
- .byte 17
- .byte 0
- .byte 37
- .byte 14
- .byte 19
- .byte 5
- .byte 3
- .byte 14
- .byte 16
- .byte 23
- .byte 27
- .byte 14
- .byte 17
- .byte 1
- .byte 18
- .byte 6
- .byte 0
- .byte 0
- .byte 0
- .section .debug_info,"",@progbits
-.Lcu_begin0:
- .long 38
- .short 4
- .long .debug_abbrev
- .byte 8
- .byte 1
- .long .Linfo_string0
- .short 4
- .long .Linfo_string1
- .long .Lline_table_start0
- .long .Linfo_string2
- .quad .Lfunc_begin0
- .long .Lfunc_end2-.Lfunc_begin0
- .section .debug_ranges,"",@progbits
- .section .debug_macinfo,"",@progbits
-.Lcu_macro_begin0:
- .byte 0
-
- .type _Z1av,@function
-_Z1av = .L.cfi.jumptable
- .type _Z1bv,@function
-_Z1bv = .L.cfi.jumptable+8
- .ident "clang version 6.0.0 (trunk 316774)"
- .section ".note.GNU-stack","",@progbits
- .section .debug_line,"",@progbits
-.Lline_table_start0:
-
+# CHECK: Expected Protected: 1 (100.00%)
+# CHECK: Unexpected Protected: 0 (0.00%)
+# CHECK: Expected Unprotected: 0 (0.00%)
+# CHECK: Unexpected Unprotected (BAD): 0 (0.00%)
diff --git a/test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s b/test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s
index d8819e16e37..65782cb5e42 100644
--- a/test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s
+++ b/test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s
@@ -1,167 +1,11 @@
-# RUN: llvm-mc %s -filetype obj -triple x86_64-linux-elf -o %t.o
+# RUN: llvm-mc %S/Inputs/unprotected-lineinfo.s -filetype obj \
+# RUN: -triple x86_64-linux-elf -o %t.o
# RUN: llvm-cfi-verify %t.o | FileCheck %s
# CHECK-LABEL: U
# CHECK-NEXT: tiny.cc:11
-# CHECK: Unprotected: 1 (100.00%), Protected: 0 (0.00%)
-
-# Source (tiny.cc):
-# void a() {}
-# void b() {}
-# int main(int argc, char** argv) {
-# void(*ptr)();
-# if (argc == 1)
-# ptr = &a;
-# else
-# ptr = &b;
-# ptr();
-# }
-# Compile with:
-# clang++ -gmlt tiny.cc -S -o tiny.s
-
- .text
- .file "tiny.cc"
- .globl _Z1av # -- Begin function _Z1av
- .p2align 4, 0x90
- .type _Z1av,@function
-_Z1av: # @_Z1av
-.Lfunc_begin0:
- .file 1 "tiny.cc"
- .loc 1 1 0 # tiny.cc:1:0
- .cfi_startproc
-# BB#0:
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
-.Ltmp0:
- .loc 1 1 11 prologue_end # tiny.cc:1:11
- popq %rbp
- retq
-.Ltmp1:
-.Lfunc_end0:
- .size _Z1av, .Lfunc_end0-_Z1av
- .cfi_endproc
- # -- End function
- .globl _Z1bv # -- Begin function _Z1bv
- .p2align 4, 0x90
- .type _Z1bv,@function
-_Z1bv: # @_Z1bv
-.Lfunc_begin1:
- .loc 1 2 0 # tiny.cc:2:0
- .cfi_startproc
-# BB#0:
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
-.Ltmp2:
- .loc 1 2 11 prologue_end # tiny.cc:2:11
- popq %rbp
- retq
-.Ltmp3:
-.Lfunc_end1:
- .size _Z1bv, .Lfunc_end1-_Z1bv
- .cfi_endproc
- # -- End function
- .globl main # -- Begin function main
- .p2align 4, 0x90
- .type main,@function
-main: # @main
-.Lfunc_begin2:
- .loc 1 4 0 # tiny.cc:4:0
- .cfi_startproc
-# BB#0:
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
- subq $32, %rsp
- movl $0, -4(%rbp)
- movl %edi, -8(%rbp)
- movq %rsi, -16(%rbp)
-.Ltmp4:
- .loc 1 6 12 prologue_end # tiny.cc:6:12
- cmpl $1, -8(%rbp)
- .loc 1 6 7 is_stmt 0 # tiny.cc:6:7
- jne .LBB2_2
-# BB#1:
- .loc 1 0 7 # tiny.cc:0:7
- movabsq $_Z1av, %rax
- .loc 1 7 9 is_stmt 1 # tiny.cc:7:9
- movq %rax, -24(%rbp)
- .loc 1 7 5 is_stmt 0 # tiny.cc:7:5
- jmp .LBB2_3
-.LBB2_2:
- .loc 1 0 5 # tiny.cc:0:5
- movabsq $_Z1bv, %rax
- .loc 1 9 9 is_stmt 1 # tiny.cc:9:9
- movq %rax, -24(%rbp)
-.LBB2_3:
- .loc 1 11 3 # tiny.cc:11:3
- callq *-24(%rbp)
- .loc 1 12 1 # tiny.cc:12:1
- movl -4(%rbp), %eax
- addq $32, %rsp
- popq %rbp
- retq
-.Ltmp5:
-.Lfunc_end2:
- .size main, .Lfunc_end2-main
- .cfi_endproc
- # -- End function
- .section .debug_str,"MS",@progbits,1
-.Linfo_string0:
- .asciz "clang version 6.0.0 (trunk 316774)" # string offset=0
-.Linfo_string1:
- .asciz "tiny.cc" # string offset=35
-.Linfo_string2:
- .asciz "/tmp/a/b" # string offset=43
- .section .debug_abbrev,"",@progbits
- .byte 1 # Abbreviation Code
- .byte 17 # DW_TAG_compile_unit
- .byte 0 # DW_CHILDREN_no
- .byte 37 # DW_AT_producer
- .byte 14 # DW_FORM_strp
- .byte 19 # DW_AT_language
- .byte 5 # DW_FORM_data2
- .byte 3 # DW_AT_name
- .byte 14 # DW_FORM_strp
- .byte 16 # DW_AT_stmt_list
- .byte 23 # DW_FORM_sec_offset
- .byte 27 # DW_AT_comp_dir
- .byte 14 # DW_FORM_strp
- .byte 17 # DW_AT_low_pc
- .byte 1 # DW_FORM_addr
- .byte 18 # DW_AT_high_pc
- .byte 6 # DW_FORM_data4
- .byte 0 # EOM(1)
- .byte 0 # EOM(2)
- .byte 0 # EOM(3)
- .section .debug_info,"",@progbits
-.Lcu_begin0:
- .long 38 # Length of Unit
- .short 4 # DWARF version number
- .long .debug_abbrev # Offset Into Abbrev. Section
- .byte 8 # Address Size (in bytes)
- .byte 1 # Abbrev [1] 0xb:0x1f DW_TAG_compile_unit
- .long .Linfo_string0 # DW_AT_producer
- .short 4 # DW_AT_language
- .long .Linfo_string1 # DW_AT_name
- .long .Lline_table_start0 # DW_AT_stmt_list
- .long .Linfo_string2 # DW_AT_comp_dir
- .quad .Lfunc_begin0 # DW_AT_low_pc
- .long .Lfunc_end2-.Lfunc_begin0 # DW_AT_high_pc
- .section .debug_ranges,"",@progbits
- .section .debug_macinfo,"",@progbits
-.Lcu_macro_begin0:
- .byte 0 # End Of Macro List Mark
-
- .ident "clang version 6.0.0 (trunk 316774)"
- .section ".note.GNU-stack","",@progbits
- .section .debug_line,"",@progbits
-.Lline_table_start0:
+# CHECK: Expected Protected: 0 (0.00%)
+# CHECK: Unexpected Protected: 0 (0.00%)
+# CHECK: Expected Unprotected: 0 (0.00%)
+# CHECK: Unexpected Unprotected (BAD): 1 (100.00%)
diff --git a/test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s b/test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s
index c023a4a84ab..246acf35f5b 100644
--- a/test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s
+++ b/test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s
@@ -1,92 +1,5 @@
-# RUN: llvm-mc %s -filetype obj -triple x86_64-linux-elf -o %t.o
+# RUN: llvm-mc %S/Inputs/unprotected-nolineinfo.s -filetype obj \
+# RUN: -triple x86_64-linux-elf -o %t.o
# RUN: not llvm-cfi-verify %t.o 2>&1 | FileCheck %s
# CHECK: DWARF line information missing. Did you compile with '-g'?
-
-# Source (tiny.cc):
-# void a() {}
-# void b() {}
-# int main(int argc, char** argv) {
-# void(*ptr)();
-# if (argc == 1)
-# ptr = &a;
-# else
-# ptr = &b;
-# ptr();
-# }
-# Compile with:
-# clang++ tiny.cc -S -o tiny.s
-
- .text
- .file "tiny.cc"
- .globl _Z1av # -- Begin function _Z1av
- .p2align 4, 0x90
- .type _Z1av,@function
-_Z1av: # @_Z1av
- .cfi_startproc
-# BB#0:
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
- popq %rbp
- retq
-.Lfunc_end0:
- .size _Z1av, .Lfunc_end0-_Z1av
- .cfi_endproc
- # -- End function
- .globl _Z1bv # -- Begin function _Z1bv
- .p2align 4, 0x90
- .type _Z1bv,@function
-_Z1bv: # @_Z1bv
- .cfi_startproc
-# BB#0:
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
- popq %rbp
- retq
-.Lfunc_end1:
- .size _Z1bv, .Lfunc_end1-_Z1bv
- .cfi_endproc
- # -- End function
- .globl main # -- Begin function main
- .p2align 4, 0x90
- .type main,@function
-main: # @main
- .cfi_startproc
-# BB#0:
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
- subq $32, %rsp
- movl $0, -4(%rbp)
- movl %edi, -8(%rbp)
- movq %rsi, -16(%rbp)
- cmpl $1, -8(%rbp)
- jne .LBB2_2
-# BB#1:
- movabsq $_Z1av, %rax
- movq %rax, -24(%rbp)
- jmp .LBB2_3
-.LBB2_2:
- movabsq $_Z1bv, %rax
- movq %rax, -24(%rbp)
-.LBB2_3:
- callq *-24(%rbp)
- movl -4(%rbp), %eax
- addq $32, %rsp
- popq %rbp
- retq
-.Lfunc_end2:
- .size main, .Lfunc_end2-main
- .cfi_endproc
- # -- End function
-
- .ident "clang version 6.0.0 (trunk 316774)"
- .section ".note.GNU-stack","",@progbits
diff --git a/test/LibDriver/Inputs/a.s b/test/tools/llvm-lib/Inputs/a.s
index 88258e2797f..88258e2797f 100644
--- a/test/LibDriver/Inputs/a.s
+++ b/test/tools/llvm-lib/Inputs/a.s
diff --git a/test/LibDriver/Inputs/b.s b/test/tools/llvm-lib/Inputs/b.s
index 4890c9247c7..4890c9247c7 100644
--- a/test/LibDriver/Inputs/b.s
+++ b/test/tools/llvm-lib/Inputs/b.s
diff --git a/test/LibDriver/Inputs/cl-gl.obj b/test/tools/llvm-lib/Inputs/cl-gl.obj
index ff746557d41..ff746557d41 100755
--- a/test/LibDriver/Inputs/cl-gl.obj
+++ b/test/tools/llvm-lib/Inputs/cl-gl.obj
Binary files differ
diff --git a/test/LibDriver/Inputs/resource.res b/test/tools/llvm-lib/Inputs/resource.res
index f1c799fbbb0..f1c799fbbb0 100644
--- a/test/LibDriver/Inputs/resource.res
+++ b/test/tools/llvm-lib/Inputs/resource.res
Binary files differ
diff --git a/test/LibDriver/infer-output-path.test b/test/tools/llvm-lib/infer-output-path.test
index c63b0abdf6e..c63b0abdf6e 100644
--- a/test/LibDriver/infer-output-path.test
+++ b/test/tools/llvm-lib/infer-output-path.test
diff --git a/test/LibDriver/invalid.test b/test/tools/llvm-lib/invalid.test
index 2978177a431..2978177a431 100644
--- a/test/LibDriver/invalid.test
+++ b/test/tools/llvm-lib/invalid.test
diff --git a/test/LibDriver/libpath.test b/test/tools/llvm-lib/libpath.test
index 26a1e8dc8b6..26a1e8dc8b6 100644
--- a/test/LibDriver/libpath.test
+++ b/test/tools/llvm-lib/libpath.test
diff --git a/test/tools/llvm-lib/lit.local.cfg b/test/tools/llvm-lib/lit.local.cfg
new file mode 100644
index 00000000000..e71f3cc4c41
--- /dev/null
+++ b/test/tools/llvm-lib/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'X86' in config.root.targets:
+ config.unsupported = True
+
diff --git a/test/LibDriver/no-inputs.test b/test/tools/llvm-lib/no-inputs.test
index 95d6555d58c..95d6555d58c 100644
--- a/test/LibDriver/no-inputs.test
+++ b/test/tools/llvm-lib/no-inputs.test
diff --git a/test/LibDriver/resource.test b/test/tools/llvm-lib/resource.test
index 6c3dad50b45..6c3dad50b45 100644
--- a/test/LibDriver/resource.test
+++ b/test/tools/llvm-lib/resource.test
diff --git a/test/LibDriver/thin.test b/test/tools/llvm-lib/thin.test
index c401de41a80..c401de41a80 100644
--- a/test/LibDriver/thin.test
+++ b/test/tools/llvm-lib/thin.test
diff --git a/test/LibDriver/use-paths.test b/test/tools/llvm-lib/use-paths.test
index 971c216127e..971c216127e 100644
--- a/test/LibDriver/use-paths.test
+++ b/test/tools/llvm-lib/use-paths.test
diff --git a/test/tools/llvm-nm/X86/externalonly.test b/test/tools/llvm-nm/X86/externalonly.test
index c3741298786..2a1853b426f 100644
--- a/test/tools/llvm-nm/X86/externalonly.test
+++ b/test/tools/llvm-nm/X86/externalonly.test
@@ -1,4 +1,5 @@
# RUN: llvm-nm -g %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s
+# RUN: llvm-nm -g -g %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s
# CHECK-NOT: EH_frame0
# CHECK: _main
diff --git a/test/tools/llvm-nm/X86/importlibrary.test b/test/tools/llvm-nm/X86/importlibrary.test
index 9111694c2c6..107628d09ef 100644
--- a/test/tools/llvm-nm/X86/importlibrary.test
+++ b/test/tools/llvm-nm/X86/importlibrary.test
@@ -1,5 +1,7 @@
# RUN: llvm-nm -B %S/Inputs/example.lib | FileCheck --match-full-lines %s
+CHECK: 00000000 I __IMPORT_DESCRIPTOR_example
+CHECK: 00000000 I __NULL_IMPORT_DESCRIPTOR
CHECK: 00000000 R __imp__constant
CHECK: 00000000 R _constant
CHECK: 00000000 D __imp__data
diff --git a/test/tools/llvm-objcopy/Inputs/dwarf.dwo b/test/tools/llvm-objcopy/Inputs/dwarf.dwo
new file mode 100644
index 00000000000..4b6fd505506
--- /dev/null
+++ b/test/tools/llvm-objcopy/Inputs/dwarf.dwo
Binary files differ
diff --git a/test/tools/llvm-objcopy/check-addr-offset-align-binary.test b/test/tools/llvm-objcopy/check-addr-offset-align-binary.test
new file mode 100644
index 00000000000..755acceeda2
--- /dev/null
+++ b/test/tools/llvm-objcopy/check-addr-offset-align-binary.test
@@ -0,0 +1,40 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objcopy -O binary %t %t2
+# RUN: od -t x1 %t2 | FileCheck %s
+
+!ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1000
+ AddressAlign: 0x0000000000001000
+ Content: "c3c3c3c3"
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x1008
+ AddressAlign: 0x0000000000000008
+ Content: "3232"
+ProgramHeaders:
+ - Type: PT_LOAD
+ Flags: [ PF_X, PF_R ]
+ VAddr: 0x1000
+ PAddr: 0x1000
+ Align: 0x1000
+ Sections:
+ - Section: .text
+ - Type: PT_LOAD
+ Flags: [ PF_R, PF_W ]
+ VAddr: 0x1008
+ PAddr: 0x1008
+ Align: 0x1000
+ Sections:
+ - Section: .data
+
+# CHECK: 0000000 c3 c3 c3 c3 00 00 00 00 32 32
diff --git a/test/tools/llvm-objcopy/check-addr-offset-align.test b/test/tools/llvm-objcopy/check-addr-offset-align.test
new file mode 100644
index 00000000000..ca2367ba434
--- /dev/null
+++ b/test/tools/llvm-objcopy/check-addr-offset-align.test
@@ -0,0 +1,67 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objcopy %t %t2
+# RUN: llvm-readobj -program-headers %t2 | FileCheck %s
+
+!ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1000
+ AddressAlign: 0x0000000000001000
+ Content: "c3c3c3c3"
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x1008
+ AddressAlign: 0x0000000000000008
+ Content: "3232"
+ProgramHeaders:
+ - Type: PT_LOAD
+ Flags: [ PF_X, PF_R ]
+ VAddr: 0x1000
+ PAddr: 0x1000
+ Align: 0x1000
+ Sections:
+ - Section: .text
+ - Type: PT_LOAD
+ Flags: [ PF_R, PF_W ]
+ VAddr: 0x1008
+ PAddr: 0x1008
+ Align: 0x1000
+ Sections:
+ - Section: .data
+
+#CHECK: ProgramHeaders [
+#CHECK-NEXT: ProgramHeader {
+#CHECK-NEXT: Type: PT_LOAD
+#CHECK-NEXT: Offset: 0x1000
+#CHECK-NEXT: VirtualAddress: 0x1000
+#CHECK-NEXT: PhysicalAddress: 0x1000
+#CHECK-NEXT: FileSize: 4
+#CHECK-NEXT: MemSize: 4
+#CHECK-NEXT: Flags [
+#CHECK-NEXT: PF_R
+#CHECK-NEXT: PF_X
+#CHECK-NEXT: ]
+#CHECK-NEXT: Alignment: 4096
+#CHECK-NEXT: }
+#CHECK-NEXT: ProgramHeader {
+#CHECK-NEXT: Type: PT_LOAD
+#CHECK-NEXT: Offset: 0x1008
+#CHECK-NEXT: VirtualAddress: 0x1008
+#CHECK-NEXT: PhysicalAddress: 0x1008
+#CHECK-NEXT: FileSize: 2
+#CHECK-NEXT: MemSize: 2
+#CHECK-NEXT: Flags [
+#CHECK-NEXT: PF_R
+#CHECK-NEXT: PF_W
+#CHECK-NEXT: ]
+#CHECK-NEXT: Alignment: 4096
+#CHECK-NEXT: }
+#CHECK-NEXT:]
diff --git a/test/tools/llvm-objcopy/drawf-fission.test b/test/tools/llvm-objcopy/drawf-fission.test
new file mode 100644
index 00000000000..112bffbc891
--- /dev/null
+++ b/test/tools/llvm-objcopy/drawf-fission.test
@@ -0,0 +1,43 @@
+# RUN: llvm-objcopy -extract-dwo %p/Inputs/dwarf.dwo %t
+# RUN: llvm-objcopy -strip-dwo %p/Inputs/dwarf.dwo %t2
+# RUN: llvm-objcopy -split-dwo=%t3 %p/Inputs/dwarf.dwo %t4
+# RUN: llvm-readobj -file-headers -sections %t | FileCheck %s -check-prefix=DWARF
+# RUN: llvm-readobj -file-headers -sections %t2 | FileCheck %s -check-prefix=STRIP
+# RUN: diff %t %t3
+# RUN: diff %t2 %t4
+
+#DWARF: SectionHeaderCount: 8
+
+#DWARF: Name: .debug_loc.dwo
+#DWARF: Name: .debug_str.dwo
+#DWARF: Name: .debug_str_offsets.dwo
+#DWARF: Name: .debug_info.dwo
+#DWARF: Name: .debug_abbrev.dwo
+#DWARF: Name: .debug_line.dwo
+#DWARF: Name: .strtab
+
+#STRIP: SectionHeaderCount: 24
+
+#STRIP: Name: .text
+#STRIP: Name: .rodata.str1.1
+#STRIP: Name: .debug_str
+#STRIP: Name: .debug_abbrev
+#STRIP: Name: .debug_info
+#STRIP: Name: .debug_ranges
+#STRIP: Name: .debug_macinfo
+#STRIP: Name: .debug_addr
+#STRIP: Name: .debug_pubnames
+#STRIP: Name: .debug_pubtypes
+#STRIP: Name: .comment
+#STRIP: Name: .note.GNU-stack
+#STRIP: Name: .debug_frame
+#STRIP: Name: .debug_line
+#STRIP: Name: .symtab
+#STRIP: Name: .rela.text
+#STRIP: Name: .rela.debug_info
+#STRIP: Name: .rela.debug_addr
+#STRIP: Name: .rela.debug_pubnames
+#STRIP: Name: .rela.debug_pubtypes
+#STRIP: Name: .rela.debug_frame
+#STRIP: Name: .rela.debug_line
+#STRIP: Name: .strtab
diff --git a/test/tools/llvm-objdump/X86/Inputs/macho-invalid-reloc-section-index b/test/tools/llvm-objdump/X86/Inputs/macho-invalid-reloc-section-index
new file mode 100644
index 00000000000..a9d0b48449b
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/macho-invalid-reloc-section-index
Binary files differ
diff --git a/test/tools/llvm-objdump/X86/malformed-machos.test b/test/tools/llvm-objdump/X86/malformed-machos.test
index 292666a3725..e29df464a4e 100644
--- a/test/tools/llvm-objdump/X86/malformed-machos.test
+++ b/test/tools/llvm-objdump/X86/malformed-machos.test
@@ -66,3 +66,6 @@ INVALID-SYMBOL-LIB_ORDINAL: macho-invalid-symbol-lib_ordinal': truncated or malf
RUN: not llvm-objdump -macho -objc-meta-data %p/Inputs/macho-invalid-bind-entry 2>&1 | FileCheck -check-prefix INVALID-BIND-ENTRY %s
INVALID-BIND-ENTRY: macho-invalid-bind-entry': truncated or malformed object (for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB bad library ordinal: 83 (max 0) for opcode at: 0x0)
+
+RUN: llvm-objdump -macho -r %p/Inputs/macho-invalid-reloc-section-index | FileCheck -check-prefix INVALID-RELOC-SECTION-INDEX %s
+INVALID-RELOC-SECTION-INDEX: 0000000000000021 X86_64_RELOC_UNSIGNED 8388613 (?,?)
diff --git a/tools/dsymutil/DwarfLinker.cpp b/tools/dsymutil/DwarfLinker.cpp
index 9fb968cb5d2..0fdc690dab4 100644
--- a/tools/dsymutil/DwarfLinker.cpp
+++ b/tools/dsymutil/DwarfLinker.cpp
@@ -2366,7 +2366,7 @@ void DwarfLinker::keepDIEAndDependencies(RelocationManager &RelocMgr,
continue;
}
- Val.extractValue(Data, &Offset, &Unit);
+ Val.extractValue(Data, &Offset, Unit.getFormParams(), &Unit);
CompileUnit *ReferencedCU;
if (auto RefDie =
resolveDIEReference(*this, Units, Val, Unit, Die, ReferencedCU)) {
@@ -2965,7 +2965,7 @@ DIE *DwarfLinker::DIECloner::cloneDIE(
DWARFFormValue Val(AttrSpec.Form);
uint32_t AttrSize = Offset;
- Val.extractValue(Data, &Offset, &U);
+ Val.extractValue(Data, &Offset, U.getFormParams(), &U);
AttrSize = Offset - AttrSize;
OutOffset +=
@@ -3158,7 +3158,7 @@ void DwarfLinker::patchLineTableForUnit(CompileUnit &Unit,
DWARFDataExtractor LineExtractor(
OrigDwarf.getDWARFObj(), OrigDwarf.getDWARFObj().getLineSection(),
OrigDwarf.isLittleEndian(), Unit.getOrigUnit().getAddressByteSize());
- LineTable.parse(LineExtractor, &StmtOffset);
+ LineTable.parse(LineExtractor, &StmtOffset, &Unit.getOrigUnit());
// This vector is the output line table.
std::vector<DWARFDebugLine::Row> NewRows;
diff --git a/tools/dsymutil/dsymutil.cpp b/tools/dsymutil/dsymutil.cpp
index b6d6c909abc..9d9a2418379 100644
--- a/tools/dsymutil/dsymutil.cpp
+++ b/tools/dsymutil/dsymutil.cpp
@@ -93,8 +93,8 @@ static list<std::string> ArchFlags(
"arch",
desc("Link DWARF debug information only for specified CPU architecture\n"
"types. This option can be specified multiple times, once for each\n"
- "desired architecture. All cpu architectures will be linked by\n"
- "default."),
+ "desired architecture. All CPU architectures will be linked by\n"
+ "default."), value_desc("arch"),
ZeroOrMore, cat(DsymCategory));
static opt<bool>
@@ -338,7 +338,6 @@ int main(int argc, char **argv) {
NumThreads = 1;
NumThreads = std::min<unsigned>(NumThreads, DebugMapPtrsOrErr->size());
- llvm::ThreadPool Threads(NumThreads);
// If there is more than one link to execute, we need to generate
// temporary files.
@@ -366,17 +365,19 @@ int main(int argc, char **argv) {
// FIXME: The DwarfLinker can have some very deep recursion that can max
// out the (significantly smaller) stack when using threads. We don't
// want this limitation when we only have a single thread.
- if (NumThreads == 1)
+ if (NumThreads == 1) {
LinkLambda();
- else
+ } else {
+ llvm::ThreadPool Threads(NumThreads);
Threads.async(LinkLambda);
+ Threads.wait();
+ }
if (NeedsTempFiles)
TempFiles.emplace_back(Map->getTriple().getArchName().str(),
OutputFile);
}
- Threads.wait();
if (NeedsTempFiles &&
!MachOUtils::generateUniversalBinary(
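
The dsymutil hunks above scope the llvm::ThreadPool to the multi-threaded branch and drain it there, instead of constructing the pool up front and waiting after the loop. A minimal sketch of that pattern, assuming only llvm/Support/ThreadPool.h plus placeholder NumThreads/LinkLambda values standing in for what main() computes earlier, might look like:

    #include "llvm/Support/ThreadPool.h"
    #include <functional>

    // Illustrative only: NumThreads and LinkLambda stand in for the values
    // computed earlier in dsymutil's main().
    void runLink(unsigned NumThreads, std::function<void()> LinkLambda) {
      if (NumThreads == 1) {
        // Single-threaded: run on the main thread so deep DwarfLinker
        // recursion gets the full main-thread stack (see the FIXME above).
        LinkLambda();
      } else {
        // Multi-threaded: the pool lives only inside this branch and is
        // drained before the scope ends.
        llvm::ThreadPool Threads(NumThreads);
        Threads.async(LinkLambda);
        Threads.wait();
      }
    }
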
diff --git a/tools/llvm-ar/CMakeLists.txt b/tools/llvm-ar/CMakeLists.txt
index 731bcbd8ac9..2970a59beee 100644
--- a/tools/llvm-ar/CMakeLists.txt
+++ b/tools/llvm-ar/CMakeLists.txt
@@ -17,3 +17,9 @@ add_llvm_tool(llvm-ar
add_llvm_tool_symlink(llvm-ranlib llvm-ar)
add_llvm_tool_symlink(llvm-lib llvm-ar)
add_llvm_tool_symlink(llvm-dlltool llvm-ar)
+
+if(LLVM_INSTALL_BINUTILS_SYMLINKS)
+ add_llvm_tool_symlink(ar llvm-ar)
+ add_llvm_tool_symlink(dlltool llvm-ar)
+ add_llvm_tool_symlink(ranlib llvm-ar)
+endif()
diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp
index 576265cfe59..8c19f6b6af8 100644
--- a/tools/llvm-ar/llvm-ar.cpp
+++ b/tools/llvm-ar/llvm-ar.cpp
@@ -127,6 +127,8 @@ static cl::extrahelp MoreHelp(
" [v] - be verbose about actions taken\n"
);
+static const char OptionChars[] = "dmpqrtxabiosSTucv";
+
// This enumeration delineates the kinds of operations on an archive
// that are permitted.
enum ArchiveOperation {
@@ -864,6 +866,24 @@ int main(int argc, char **argv) {
Stem.find("lib") != StringRef::npos)
return libDriverMain(makeArrayRef(argv, argc));
+ for (int i = 1; i < argc; i++) {
+ // If an argument starts with a dash and only contains chars
+ // that belong to the options chars set, remove the dash.
+ // We can't handle it after the command line options parsing
+ // is done, since it will error out on an unrecognized string
+ // starting with a dash.
+ // Make sure this doesn't match the actual llvm-ar specific options
+ // that start with a dash.
+ StringRef S = argv[i];
+ if (S.startswith("-") &&
+ S.find_first_not_of(OptionChars, 1) == StringRef::npos) {
+ argv[i]++;
+ break;
+ }
+ if (S == "--")
+ break;
+ }
+
// Have the command line options parsed and handle things
// like --help and --version.
cl::ParseCommandLineOptions(argc, argv,
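
The loop added above lets llvm-ar accept traditional ar-style key groups written with a leading dash (for example "-rcs") by dropping the dash before cl::ParseCommandLineOptions runs, while leaving real long options and everything after "--" untouched. A standalone sketch of the same idea, using std::string_view instead of StringRef, could be:

    #include <string_view>

    // Characters that may appear in an ar-style key group (copied from the
    // OptionChars set in the diff above).
    constexpr std::string_view OptionChars = "dmpqrtxabiosSTucv";

    // Returns the argument with its leading dash removed when the rest of it
    // consists solely of option characters; otherwise returns it unchanged.
    std::string_view normalizeKeyGroup(std::string_view Arg) {
      if (Arg.size() > 1 && Arg.front() == '-' &&
          Arg.find_first_not_of(OptionChars, 1) == std::string_view::npos)
        return Arg.substr(1); // "-rcs" -> "rcs"
      return Arg;             // "--version", "-format", "foo.a" stay as-is
    }
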
diff --git a/tools/llvm-cfi-verify/CMakeLists.txt b/tools/llvm-cfi-verify/CMakeLists.txt
index 07c6504bf48..de6a46e7859 100644
--- a/tools/llvm-cfi-verify/CMakeLists.txt
+++ b/tools/llvm-cfi-verify/CMakeLists.txt
@@ -4,11 +4,11 @@ set(LLVM_LINK_COMPONENTS
AllTargetsDescs
AllTargetsDisassemblers
AllTargetsInfos
- DebugInfoDWARF
MC
MCParser
Object
Support
+ Symbolize
)
add_llvm_tool(llvm-cfi-verify
diff --git a/tools/llvm-cfi-verify/LLVMBuild.txt b/tools/llvm-cfi-verify/LLVMBuild.txt
index 5c4ce263090..d5e93230272 100644
--- a/tools/llvm-cfi-verify/LLVMBuild.txt
+++ b/tools/llvm-cfi-verify/LLVMBuild.txt
@@ -19,4 +19,4 @@
type = Tool
name = llvm-cfi-verify
parent = Tools
-required_libraries = all-targets DebugInfoDWARF MC MCDisassembler MCParser Support
+required_libraries = all-targets MC MCDisassembler MCParser Support Symbolize
diff --git a/tools/llvm-cfi-verify/lib/CMakeLists.txt b/tools/llvm-cfi-verify/lib/CMakeLists.txt
index c90e4ed485e..cd728e004b2 100644
--- a/tools/llvm-cfi-verify/lib/CMakeLists.txt
+++ b/tools/llvm-cfi-verify/lib/CMakeLists.txt
@@ -11,5 +11,7 @@ llvm_map_components_to_libnames(libs
MC
MCParser
Object
- Support)
+ Support
+ Symbolize)
target_link_libraries(LLVMCFIVerify ${libs})
+set_target_properties(LLVMCFIVerify PROPERTIES FOLDER "Libraries") \ No newline at end of file
diff --git a/tools/llvm-cfi-verify/lib/FileAnalysis.cpp b/tools/llvm-cfi-verify/lib/FileAnalysis.cpp
index 278e861dfd3..0d4e1f497ff 100644
--- a/tools/llvm-cfi-verify/lib/FileAnalysis.cpp
+++ b/tools/llvm-cfi-verify/lib/FileAnalysis.cpp
@@ -39,22 +39,20 @@
#include <functional>
using Instr = llvm::cfi_verify::FileAnalysis::Instr;
+using LLVMSymbolizer = llvm::symbolize::LLVMSymbolizer;
namespace llvm {
namespace cfi_verify {
-static cl::opt<bool> IgnoreDWARF(
+bool IgnoreDWARFFlag;
+
+static cl::opt<bool, true> IgnoreDWARFArg(
"ignore-dwarf",
cl::desc(
"Ignore all DWARF data. This relaxes the requirements for all "
"statically linked libraries to have been compiled with '-g', but "
"will result in false positives for 'CFI unprotected' instructions."),
- cl::init(false));
-
-cl::opt<unsigned long long> DWARFSearchRange(
- "dwarf-search-range",
- cl::desc("Address search range used to determine if instruction is valid."),
- cl::init(0x10));
+ cl::location(IgnoreDWARFFlag), cl::init(false));
Expected<FileAnalysis> FileAnalysis::Create(StringRef Filename) {
// Open the filename provided.
@@ -256,12 +254,16 @@ const MCInstrAnalysis *FileAnalysis::getMCInstrAnalysis() const {
return MIA.get();
}
+LLVMSymbolizer &FileAnalysis::getSymbolizer() { return *Symbolizer; }
+
Error FileAnalysis::initialiseDisassemblyMembers() {
std::string TripleName = ObjectTriple.getTriple();
ArchName = "";
MCPU = "";
std::string ErrorString;
+ Symbolizer.reset(new LLVMSymbolizer());
+
ObjectTarget =
TargetRegistry::lookupTarget(ArchName, ObjectTriple, ErrorString);
if (!ObjectTarget)
@@ -308,8 +310,8 @@ Error FileAnalysis::initialiseDisassemblyMembers() {
}
Error FileAnalysis::parseCodeSections() {
- if (!IgnoreDWARF) {
- DWARF.reset(DWARFContext::create(*Object).release());
+ if (!IgnoreDWARFFlag) {
+ std::unique_ptr<DWARFContext> DWARF = DWARFContext::create(*Object);
if (!DWARF)
return make_error<StringError>("Could not create DWARF information.",
inconvertibleErrorCode());
@@ -347,21 +349,9 @@ Error FileAnalysis::parseCodeSections() {
return Error::success();
}
-DILineInfoTable FileAnalysis::getLineInfoForAddressRange(uint64_t Address) {
- if (!hasLineTableInfo())
- return DILineInfoTable();
-
- return DWARF->getLineInfoForAddressRange(Address, DWARFSearchRange);
-}
-
-bool FileAnalysis::hasValidLineInfoForAddressRange(uint64_t Address) {
- return !getLineInfoForAddressRange(Address).empty();
-}
-
-bool FileAnalysis::hasLineTableInfo() const { return DWARF != nullptr; }
-
void FileAnalysis::parseSectionContents(ArrayRef<uint8_t> SectionBytes,
uint64_t SectionAddress) {
+ assert(Symbolizer && "Symbolizer is uninitialised.");
MCInst Instruction;
Instr InstrMeta;
uint64_t InstructionSize;
@@ -381,8 +371,19 @@ void FileAnalysis::parseSectionContents(ArrayRef<uint8_t> SectionBytes,
InstrMeta.Valid = ValidInstruction;
// Check if this instruction exists in the range of the DWARF metadata.
- if (hasLineTableInfo() && !hasValidLineInfoForAddressRange(VMAddress))
- continue;
+ if (!IgnoreDWARFFlag) {
+ auto LineInfo =
+ Symbolizer->symbolizeCode(Object->getFileName(), VMAddress);
+ if (!LineInfo) {
+ handleAllErrors(LineInfo.takeError(), [](const ErrorInfoBase &E) {
+ errs() << "Symbolizer failed to get line: " << E.message() << "\n";
+ });
+ continue;
+ }
+
+ if (LineInfo->FileName == "<invalid>")
+ continue;
+ }
addInstruction(InstrMeta);
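
With the DWARF context gone, parseSectionContents() now asks LLVMSymbolizer::symbolizeCode() for each instruction's source location and skips instructions whose file name comes back as "<invalid>". A hedged, self-contained sketch of that filter, with Binary and Address as placeholder inputs, might read:

    #include "llvm/DebugInfo/Symbolize/Symbolize.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    // Returns true if the symbolizer can attribute Address in Binary to a real
    // source location, mirroring the filter added in parseSectionContents().
    bool hasUsableLineInfo(symbolize::LLVMSymbolizer &Symbolizer,
                           StringRef Binary, uint64_t Address) {
      auto LineInfo = Symbolizer.symbolizeCode(Binary.str(), Address);
      if (!LineInfo) {
        handleAllErrors(LineInfo.takeError(), [](const ErrorInfoBase &E) {
          errs() << "Symbolizer failed to get line: " << E.message() << "\n";
        });
        return false;
      }
      return LineInfo->FileName != "<invalid>";
    }
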
diff --git a/tools/llvm-cfi-verify/lib/FileAnalysis.h b/tools/llvm-cfi-verify/lib/FileAnalysis.h
index 9945a2110a2..e0eecb037c3 100644
--- a/tools/llvm-cfi-verify/lib/FileAnalysis.h
+++ b/tools/llvm-cfi-verify/lib/FileAnalysis.h
@@ -12,7 +12,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/Symbolize/Symbolize.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
@@ -44,6 +44,8 @@
namespace llvm {
namespace cfi_verify {
+extern bool IgnoreDWARFFlag;
+
// Disassembler and analysis tool for machine code files. Keeps track of non-
// sequential control flows, including indirect control flow instructions.
class FileAnalysis {
@@ -120,6 +122,7 @@ public:
const MCRegisterInfo *getRegisterInfo() const;
const MCInstrInfo *getMCInstrInfo() const;
const MCInstrAnalysis *getMCInstrAnalysis() const;
+ symbolize::LLVMSymbolizer &getSymbolizer();
// Returns true if this class is using DWARF line tables for elimination.
bool hasLineTableInfo() const;
@@ -175,8 +178,8 @@ private:
std::unique_ptr<const MCInstrAnalysis> MIA;
std::unique_ptr<MCInstPrinter> Printer;
- // DWARF debug information.
- std::unique_ptr<DWARFContext> DWARF;
+ // Symbolizer used for debug information parsing.
+ std::unique_ptr<symbolize::LLVMSymbolizer> Symbolizer;
// A mapping between the virtual memory address to the instruction metadata
// struct. TODO(hctim): Reimplement this as a sorted vector to avoid per-
diff --git a/tools/llvm-cfi-verify/lib/LLVMBuild.txt b/tools/llvm-cfi-verify/lib/LLVMBuild.txt
index 99b678fc88a..c0ae1905521 100644
--- a/tools/llvm-cfi-verify/lib/LLVMBuild.txt
+++ b/tools/llvm-cfi-verify/lib/LLVMBuild.txt
@@ -19,4 +19,4 @@
type = Library
name = CFIVerify
parent = Libraries
-required_libraries = DebugInfoDWARF MC MCDisassembler MCParser Support
+required_libraries = DebugInfoDWARF MC MCDisassembler MCParser Support Symbolize
diff --git a/tools/llvm-cfi-verify/llvm-cfi-verify.cpp b/tools/llvm-cfi-verify/llvm-cfi-verify.cpp
index d4a46fcc226..3b4a5c155d0 100644
--- a/tools/llvm-cfi-verify/llvm-cfi-verify.cpp
+++ b/tools/llvm-cfi-verify/llvm-cfi-verify.cpp
@@ -23,6 +23,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/SpecialCaseList.h"
#include <cstdlib>
@@ -32,48 +33,122 @@ using namespace llvm::cfi_verify;
cl::opt<std::string> InputFilename(cl::Positional, cl::desc("<input file>"),
cl::Required);
+cl::opt<std::string> BlacklistFilename(cl::Positional,
+ cl::desc("[blacklist file]"),
+ cl::init("-"));
ExitOnError ExitOnErr;
-void printIndirectCFInstructions(FileAnalysis &Analysis) {
- uint64_t ProtectedCount = 0;
- uint64_t UnprotectedCount = 0;
+void printIndirectCFInstructions(FileAnalysis &Analysis,
+ const SpecialCaseList *SpecialCaseList) {
+ uint64_t ExpectedProtected = 0;
+ uint64_t UnexpectedProtected = 0;
+ uint64_t ExpectedUnprotected = 0;
+ uint64_t UnexpectedUnprotected = 0;
+
+ symbolize::LLVMSymbolizer &Symbolizer = Analysis.getSymbolizer();
for (uint64_t Address : Analysis.getIndirectInstructions()) {
const auto &InstrMeta = Analysis.getInstructionOrDie(Address);
- if (Analysis.isIndirectInstructionCFIProtected(Address)) {
+ bool CFIProtected = Analysis.isIndirectInstructionCFIProtected(Address);
+
+ if (CFIProtected)
outs() << "P ";
- ProtectedCount++;
- } else {
+ else
outs() << "U ";
- UnprotectedCount++;
- }
outs() << format_hex(Address, 2) << " | "
<< Analysis.getMCInstrInfo()->getName(
InstrMeta.Instruction.getOpcode())
- << " ";
- outs() << "\n";
-
- if (Analysis.hasLineTableInfo()) {
- for (const auto &LineKV : Analysis.getLineInfoForAddressRange(Address)) {
- outs() << " " << format_hex(LineKV.first, 2) << " = "
- << LineKV.second.FileName << ":" << LineKV.second.Line << ":"
- << LineKV.second.Column << " (" << LineKV.second.FunctionName
- << ")\n";
+ << " \n";
+
+ if (IgnoreDWARFFlag) {
+ if (CFIProtected)
+ ExpectedProtected++;
+ else
+ UnexpectedUnprotected++;
+ continue;
+ }
+
+ auto InliningInfo = Symbolizer.symbolizeInlinedCode(InputFilename, Address);
+ if (!InliningInfo || InliningInfo->getNumberOfFrames() == 0) {
+ errs() << "Failed to symbolise " << format_hex(Address, 2)
+ << " with line tables from " << InputFilename << "\n";
+ exit(EXIT_FAILURE);
+ }
+
+ const auto &LineInfo =
+ InliningInfo->getFrame(InliningInfo->getNumberOfFrames() - 1);
+
+ // Print the inlining symbolisation of this instruction.
+ for (uint32_t i = 0; i < InliningInfo->getNumberOfFrames(); ++i) {
+ const auto &Line = InliningInfo->getFrame(i);
+ outs() << " " << format_hex(Address, 2) << " = " << Line.FileName << ":"
+ << Line.Line << ":" << Line.Column << " (" << Line.FunctionName
+ << ")\n";
+ }
+
+ if (!SpecialCaseList) {
+ if (CFIProtected)
+ ExpectedProtected++;
+ else
+ UnexpectedUnprotected++;
+ continue;
+ }
+
+ bool MatchesBlacklistRule = false;
+ if (SpecialCaseList->inSection("cfi-icall", "src", LineInfo.FileName) ||
+ SpecialCaseList->inSection("cfi-vcall", "src", LineInfo.FileName)) {
+ outs() << "BLACKLIST MATCH, 'src'\n";
+ MatchesBlacklistRule = true;
+ }
+
+ if (SpecialCaseList->inSection("cfi-icall", "fun", LineInfo.FunctionName) ||
+ SpecialCaseList->inSection("cfi-vcall", "fun", LineInfo.FunctionName)) {
+ outs() << "BLACKLIST MATCH, 'fun'\n";
+ MatchesBlacklistRule = true;
+ }
+
+ if (MatchesBlacklistRule) {
+ if (CFIProtected) {
+ UnexpectedProtected++;
+ outs() << "====> Unexpected Protected\n";
+ } else {
+ ExpectedUnprotected++;
+ outs() << "====> Expected Unprotected\n";
+ }
+ } else {
+ if (CFIProtected) {
+ ExpectedProtected++;
+ outs() << "====> Expected Protected\n";
+ } else {
+ UnexpectedUnprotected++;
+ outs() << "====> Unexpected Unprotected\n";
}
}
}
- if (ProtectedCount || UnprotectedCount)
- outs() << formatv(
- "Unprotected: {0} ({1:P}), Protected: {2} ({3:P})\n", UnprotectedCount,
- (((double)UnprotectedCount) / (UnprotectedCount + ProtectedCount)),
- ProtectedCount,
- (((double)ProtectedCount) / (UnprotectedCount + ProtectedCount)));
- else
+ uint64_t IndirectCFInstructions = ExpectedProtected + UnexpectedProtected +
+ ExpectedUnprotected + UnexpectedUnprotected;
+
+ if (IndirectCFInstructions == 0) {
outs() << "No indirect CF instructions found.\n";
+ return;
+ }
+
+ outs() << formatv("Expected Protected: {0} ({1:P})\n"
+ "Unexpected Protected: {2} ({3:P})\n"
+ "Expected Unprotected: {4} ({5:P})\n"
+ "Unexpected Unprotected (BAD): {6} ({7:P})\n",
+ ExpectedProtected,
+ ((double)ExpectedProtected) / IndirectCFInstructions,
+ UnexpectedProtected,
+ ((double)UnexpectedProtected) / IndirectCFInstructions,
+ ExpectedUnprotected,
+ ((double)ExpectedUnprotected) / IndirectCFInstructions,
+ UnexpectedUnprotected,
+ ((double)UnexpectedUnprotected) / IndirectCFInstructions);
}
int main(int argc, char **argv) {
@@ -89,8 +164,18 @@ int main(int argc, char **argv) {
InitializeAllAsmParsers();
InitializeAllDisassemblers();
+ std::unique_ptr<SpecialCaseList> SpecialCaseList;
+ if (BlacklistFilename != "-") {
+ std::string Error;
+ SpecialCaseList = SpecialCaseList::create({BlacklistFilename}, Error);
+ if (!SpecialCaseList) {
+ errs() << "Failed to get blacklist: " << Error << "\n";
+ exit(EXIT_FAILURE);
+ }
+ }
+
FileAnalysis Analysis = ExitOnErr(FileAnalysis::Create(InputFilename));
- printIndirectCFInstructions(Analysis);
+ printIndirectCFInstructions(Analysis, SpecialCaseList.get());
return EXIT_SUCCESS;
}
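
printIndirectCFInstructions() now buckets every indirect control-flow instruction along two axes: whether the analysis found a CFI check, and whether the blacklist says that source location was deliberately left unprotected. A compact sketch of that classification, with the counter names taken from the diff, could be:

    #include <cstdint>

    struct CFITally {
      uint64_t ExpectedProtected = 0;     // protected, not blacklisted
      uint64_t UnexpectedProtected = 0;   // protected despite a blacklist match
      uint64_t ExpectedUnprotected = 0;   // unprotected and blacklisted
      uint64_t UnexpectedUnprotected = 0; // unprotected, not blacklisted (BAD)
    };

    // Classify one instruction from the two facts the tool computes per address.
    void tally(CFITally &T, bool CFIProtected, bool MatchesBlacklistRule) {
      if (MatchesBlacklistRule) {
        if (CFIProtected)
          ++T.UnexpectedProtected;   // blacklisted code was protected anyway
        else
          ++T.ExpectedUnprotected;   // blacklisted code, unprotected as expected
      } else {
        if (CFIProtected)
          ++T.ExpectedProtected;     // normal case: protected indirect call
        else
          ++T.UnexpectedUnprotected; // the finding cfi-verify exists to report
      }
    }
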
diff --git a/tools/llvm-cov/gcov.cpp b/tools/llvm-cov/gcov.cpp
index 4df7f015fd1..7776f2aa9a6 100644
--- a/tools/llvm-cov/gcov.cpp
+++ b/tools/llvm-cov/gcov.cpp
@@ -11,11 +11,11 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ProfileData/GCOV.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/GCOV.h"
#include "llvm/Support/Path.h"
#include <system_error>
using namespace llvm;
diff --git a/tools/llvm-cvtres/llvm-cvtres.cpp b/tools/llvm-cvtres/llvm-cvtres.cpp
index 36c15925e84..433a75f63dc 100644
--- a/tools/llvm-cvtres/llvm-cvtres.cpp
+++ b/tools/llvm-cvtres/llvm-cvtres.cpp
@@ -202,7 +202,7 @@ int main(int argc_, const char *argv_[]) {
auto FileOrErr =
FileOutputBuffer::create(OutputFile, OutputBuffer->getBufferSize());
if (!FileOrErr)
- reportError(OutputFile, FileOrErr.getError());
+ reportError(OutputFile, errorToErrorCode(FileOrErr.takeError()));
std::unique_ptr<FileOutputBuffer> FileBuffer = std::move(*FileOrErr);
std::copy(OutputBuffer->getBufferStart(), OutputBuffer->getBufferEnd(),
FileBuffer->getBufferStart());
diff --git a/tools/llvm-cxxdump/llvm-cxxdump.cpp b/tools/llvm-cxxdump/llvm-cxxdump.cpp
index b10759ad05c..69b1a8ef209 100644
--- a/tools/llvm-cxxdump/llvm-cxxdump.cpp
+++ b/tools/llvm-cxxdump/llvm-cxxdump.cpp
@@ -546,11 +546,10 @@ int main(int argc, const char *argv[]) {
cl::ParseCommandLineOptions(argc, argv, "LLVM C++ ABI Data Dumper\n");
// Default to stdin if no filename is specified.
- if (opts::InputFilenames.size() == 0)
- opts::InputFilenames.push_back("-");
-
- std::for_each(opts::InputFilenames.begin(), opts::InputFilenames.end(),
- dumpInput);
-
- return EXIT_SUCCESS;
-}
+ if (opts::InputFilenames.size() == 0)
+ opts::InputFilenames.push_back("-");
+
+ llvm::for_each(opts::InputFilenames, dumpInput);
+
+ return EXIT_SUCCESS;
+}
diff --git a/tools/llvm-cxxfilt/CMakeLists.txt b/tools/llvm-cxxfilt/CMakeLists.txt
index 488064d08da..2a78acad80a 100644
--- a/tools/llvm-cxxfilt/CMakeLists.txt
+++ b/tools/llvm-cxxfilt/CMakeLists.txt
@@ -6,3 +6,7 @@ set(LLVM_LINK_COMPONENTS
add_llvm_tool(llvm-cxxfilt
llvm-cxxfilt.cpp
)
+
+if(LLVM_INSTALL_BINUTILS_SYMLINKS)
+ add_llvm_tool_symlink(c++filt llvm-cxxfilt)
+endif()
diff --git a/tools/llvm-dwp/CMakeLists.txt b/tools/llvm-dwp/CMakeLists.txt
index 98d67e04fe6..1b5fbddc1f7 100644
--- a/tools/llvm-dwp/CMakeLists.txt
+++ b/tools/llvm-dwp/CMakeLists.txt
@@ -15,3 +15,7 @@ add_llvm_tool(llvm-dwp
DEPENDS
intrinsics_gen
)
+
+if(LLVM_INSTALL_BINUTILS_SYMLINKS)
+ add_llvm_tool_symlink(dwp llvm-dwp)
+endif()
diff --git a/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp b/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp
index 5a0d6ac4f47..96dbc245ed9 100644
--- a/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp
+++ b/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp
@@ -13,6 +13,7 @@
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -84,7 +85,7 @@ class LLVMFuzzerInputBuffer : public MemoryBuffer
{
public:
LLVMFuzzerInputBuffer(const uint8_t *data_, size_t size_)
- : Data(reinterpret_cast<const char *>(data_)),
+ : Data(reinterpret_cast<const char *>(data_)),
Size(size_) {
init(Data, Data+Size, false);
}
@@ -230,7 +231,8 @@ int AssembleOneInput(const uint8_t *Data, size_t Size) {
MCAsmBackend *MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, MCPU,
MCOptions);
Str.reset(TheTarget->createMCObjectStreamer(
- TheTriple, Ctx, *MAB, *OS, CE, *STI, MCOptions.MCRelaxAll,
+ TheTriple, Ctx, std::unique_ptr<MCAsmBackend>(MAB), *OS,
+ std::unique_ptr<MCCodeEmitter>(CE), *STI, MCOptions.MCRelaxAll,
MCOptions.MCIncrementalLinkerCompatible,
/*DWARFMustBeAtTheEnd*/ false));
}
diff --git a/tools/llvm-mcmarkup/llvm-mcmarkup.cpp b/tools/llvm-mcmarkup/llvm-mcmarkup.cpp
index 0be3c715eee..db57a6bdaa8 100644
--- a/tools/llvm-mcmarkup/llvm-mcmarkup.cpp
+++ b/tools/llvm-mcmarkup/llvm-mcmarkup.cpp
@@ -217,10 +217,9 @@ int main(int argc, char **argv) {
ToolName = argv[0];
// If no input files specified, read from stdin.
- if (InputFilenames.size() == 0)
- InputFilenames.push_back("-");
-
- std::for_each(InputFilenames.begin(), InputFilenames.end(),
- parseMCMarkup);
- return 0;
-}
+ if (InputFilenames.size() == 0)
+ InputFilenames.push_back("-");
+
+ llvm::for_each(InputFilenames, parseMCMarkup);
+ return 0;
+}
diff --git a/tools/llvm-mt/llvm-mt.cpp b/tools/llvm-mt/llvm-mt.cpp
index 9bc9d332ebf..23cedb056a6 100644
--- a/tools/llvm-mt/llvm-mt.cpp
+++ b/tools/llvm-mt/llvm-mt.cpp
@@ -146,10 +146,10 @@ int main(int argc, const char **argv) {
std::unique_ptr<MemoryBuffer> OutputBuffer = Merger.getMergedManifest();
if (!OutputBuffer)
reportError("empty manifest not written");
- ErrorOr<std::unique_ptr<FileOutputBuffer>> FileOrErr =
+ Expected<std::unique_ptr<FileOutputBuffer>> FileOrErr =
FileOutputBuffer::create(OutputFile, OutputBuffer->getBufferSize());
if (!FileOrErr)
- reportError(OutputFile, FileOrErr.getError());
+ reportError(OutputFile, errorToErrorCode(FileOrErr.takeError()));
std::unique_ptr<FileOutputBuffer> FileBuffer = std::move(*FileOrErr);
std::copy(OutputBuffer->getBufferStart(), OutputBuffer->getBufferEnd(),
FileBuffer->getBufferStart());
diff --git a/tools/llvm-nm/CMakeLists.txt b/tools/llvm-nm/CMakeLists.txt
index 08bcd5f3089..f093cc4328a 100644
--- a/tools/llvm-nm/CMakeLists.txt
+++ b/tools/llvm-nm/CMakeLists.txt
@@ -14,3 +14,7 @@ add_llvm_tool(llvm-nm
DEPENDS
intrinsics_gen
)
+
+if(LLVM_INSTALL_BINUTILS_SYMLINKS)
+ add_llvm_tool_symlink(nm llvm-nm)
+endif()
diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp
index 4ad0d95d67f..d2909644628 100644
--- a/tools/llvm-nm/llvm-nm.cpp
+++ b/tools/llvm-nm/llvm-nm.cpp
@@ -85,9 +85,11 @@ cl::alias DefinedOnly2("U", cl::desc("Alias for --defined-only"),
cl::aliasopt(DefinedOnly), cl::Grouping);
cl::opt<bool> ExternalOnly("extern-only",
- cl::desc("Show only external symbols"));
+ cl::desc("Show only external symbols"),
+ cl::ZeroOrMore);
cl::alias ExternalOnly2("g", cl::desc("Alias for --extern-only"),
- cl::aliasopt(ExternalOnly), cl::Grouping);
+ cl::aliasopt(ExternalOnly), cl::Grouping,
+ cl::ZeroOrMore);
cl::opt<bool> BSDFormat("B", cl::desc("Alias for --format=bsd"),
cl::Grouping);
@@ -946,6 +948,10 @@ static char getSymbolNMTypeChar(COFFObjectFile &Obj, symbol_iterator I) {
section_iterator SecI = *SecIOrErr;
const coff_section *Section = Obj.getCOFFSection(*SecI);
Characteristics = Section->Characteristics;
+ StringRef SectionName;
+ Obj.getSectionName(Section, SectionName);
+ if (SectionName.startswith(".idata"))
+ return 'i';
}
switch (Symb.getSectionNumber()) {
@@ -1971,8 +1977,7 @@ int main(int argc, char **argv) {
if (NoDyldInfo && (AddDyldInfo || DyldInfoOnly))
error("-no-dyldinfo can't be used with -add-dyldinfo or -dyldinfo-only");
- std::for_each(InputFilenames.begin(), InputFilenames.end(),
- dumpSymbolNamesFromFile);
+ llvm::for_each(InputFilenames, dumpSymbolNamesFromFile);
if (HadError)
return 1;
diff --git a/tools/llvm-objcopy/CMakeLists.txt b/tools/llvm-objcopy/CMakeLists.txt
index 18cc2075345..05aa727ab9d 100644
--- a/tools/llvm-objcopy/CMakeLists.txt
+++ b/tools/llvm-objcopy/CMakeLists.txt
@@ -7,3 +7,7 @@ add_llvm_tool(llvm-objcopy
llvm-objcopy.cpp
Object.cpp
)
+
+if(LLVM_INSTALL_BINUTILS_SYMLINKS)
+ add_llvm_tool_symlink(objcopy llvm-objcopy)
+endif()
diff --git a/tools/llvm-objcopy/Object.cpp b/tools/llvm-objcopy/Object.cpp
index 22ae47f1cac..5f9864d9cc0 100644
--- a/tools/llvm-objcopy/Object.cpp
+++ b/tools/llvm-objcopy/Object.cpp
@@ -685,6 +685,19 @@ template <class ELFT> void ELFObject<ELFT>::sortSections() {
CompareSections);
}
+static uint64_t alignToAddr(uint64_t Offset, uint64_t Addr, uint64_t Align) {
+ // Calculate Diff such that (Offset + Diff) & -Align == Addr & -Align.
+ if (Align == 0)
+ Align = 1;
+ auto Diff =
+ static_cast<int64_t>(Addr % Align) - static_cast<int64_t>(Offset % Align);
+ // We only want to add to Offset, however, so if Diff < 0 we can add Align and
+ // (Offset + Diff) & -Align == Addr & -Align will still hold.
+ if (Diff < 0)
+ Diff += Align;
+ return Offset + Diff;
+}
+
template <class ELFT> void ELFObject<ELFT>::assignOffsets() {
// We need a temporary list of segments that has a special order to it
// so that we know that anytime ->ParentSegment is set that segment has
@@ -728,7 +741,7 @@ template <class ELFT> void ELFObject<ELFT>::assignOffsets() {
Segment->Offset =
Parent->Offset + Segment->OriginalOffset - Parent->OriginalOffset;
} else {
- Offset = alignTo(Offset, Segment->Align == 0 ? 1 : Segment->Align);
+ Offset = alignToAddr(Offset, Segment->VAddr, Segment->Align);
Segment->Offset = Offset;
}
Offset = std::max(Offset, Segment->Offset + Segment->FileSize);
@@ -829,8 +842,9 @@ template <class ELFT> void BinaryObject<ELFT>::finalize() {
uint64_t Offset = 0;
for (auto &Segment : this->Segments) {
- if (Segment->Type == PT_LOAD && Segment->firstSection() != nullptr) {
- Offset = alignTo(Offset, Segment->Align);
+ if (Segment->Type == llvm::ELF::PT_LOAD &&
+ Segment->firstSection() != nullptr) {
+ Offset = alignToAddr(Offset, Segment->VAddr, Segment->Align);
Segment->Offset = Offset;
Offset += Segment->FileSize;
}
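
The new alignToAddr() helper does not round Offset up to a multiple of Align; it rounds up to the next offset that is congruent to the segment's address modulo Align, which is what lets the loader map the segment at its requested virtual address. A worked example built around a copy of the helper (the starting offsets below are illustrative) ties it back to the check-addr-offset-align tests added earlier in this patch:

    #include <cassert>
    #include <cstdint>

    // Copy of the helper introduced in Object.cpp: the smallest offset
    // >= Offset with Offset' % Align == Addr % Align.
    static uint64_t alignToAddr(uint64_t Offset, uint64_t Addr, uint64_t Align) {
      if (Align == 0)
        Align = 1;
      auto Diff =
          static_cast<int64_t>(Addr % Align) - static_cast<int64_t>(Offset % Align);
      if (Diff < 0)
        Diff += Align;
      return Offset + Diff;
    }

    int main() {
      // ELF output (check-addr-offset-align.test): .text lands at file offset
      // 0x1000 and .data at 0x1008, congruent with their addresses mod 0x1000.
      assert(alignToAddr(0x78, 0x1000, 0x1000) == 0x1000);
      assert(alignToAddr(0x1004, 0x1008, 0x1000) == 0x1008);
      // Raw binary output (check-addr-offset-align-binary.test): .text starts
      // at offset 0 and .data at offset 8, which is why od shows four bytes of
      // zero padding between "c3 c3 c3 c3" and "32 32".
      assert(alignToAddr(0, 0x1000, 0x1000) == 0);
      assert(alignToAddr(4, 0x1008, 0x1000) == 8);
      return 0;
    }
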
diff --git a/tools/llvm-objcopy/Object.h b/tools/llvm-objcopy/Object.h
index 9c77f5900ce..f12e6da7d21 100644
--- a/tools/llvm-objcopy/Object.h
+++ b/tools/llvm-objcopy/Object.h
@@ -368,6 +368,7 @@ public:
Object(const object::ELFObjectFile<ELFT> &Obj);
virtual ~Object() = default;
+ const SectionBase *getSectionHeaderStrTab() const { return SectionNames; }
void removeSections(std::function<bool(const SectionBase &)> ToRemove);
virtual size_t totalSize() const = 0;
virtual void finalize() = 0;
diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp
index f3e9c7750a6..c923f902db8 100644
--- a/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -83,12 +83,63 @@ static cl::alias ToRemoveA("R", cl::desc("Alias for remove-section"),
cl::aliasopt(ToRemove));
static cl::opt<bool> StripSections("strip-sections",
cl::desc("Remove all section headers"));
+static cl::opt<bool>
+ StripDWO("strip-dwo", cl::desc("remove all DWARF .dwo sections from file"));
+static cl::opt<bool> ExtractDWO(
+ "extract-dwo",
+ cl::desc("remove all sections that are not DWARF .dwo sections from file"));
+static cl::opt<std::string>
+ SplitDWO("split-dwo",
+ cl::desc("equivalent to extract-dwo on the input file to "
+ "<dwo-file>, then strip-dwo on the input file"),
+ cl::value_desc("dwo-file"));
using SectionPred = std::function<bool(const SectionBase &Sec)>;
-void CopyBinary(const ELFObjectFile<ELF64LE> &ObjFile) {
+bool IsDWOSection(const SectionBase &Sec) {
+ return Sec.Name.endswith(".dwo");
+}
+
+template <class ELFT>
+bool OnlyKeepDWOPred(const Object<ELFT> &Obj, const SectionBase &Sec) {
+ // We can't remove the section header string table.
+ if (&Sec == Obj.getSectionHeaderStrTab())
+ return false;
+ // Short of keeping the string table we want to keep everything that is a DWO
+ // section and remove everything else.
+ return !IsDWOSection(Sec);
+}
+
+template <class ELFT>
+void WriteObjectFile(const Object<ELFT> &Obj, StringRef File) {
std::unique_ptr<FileOutputBuffer> Buffer;
+ Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
+ FileOutputBuffer::create(File, Obj.totalSize(),
+ FileOutputBuffer::F_executable);
+ if (BufferOrErr.takeError())
+ error("failed to open " + OutputFilename);
+ else
+ Buffer = std::move(*BufferOrErr);
+ Obj.write(*Buffer);
+ if (auto E = Buffer->commit())
+ reportError(File, errorToErrorCode(std::move(E)));
+}
+
+template <class ELFT>
+void SplitDWOToFile(const ELFObjectFile<ELFT> &ObjFile, StringRef File) {
+ // Construct a second output file for the DWO sections.
+ ELFObject<ELFT> DWOFile(ObjFile);
+
+ DWOFile.removeSections([&](const SectionBase &Sec) {
+ return OnlyKeepDWOPred<ELFT>(DWOFile, Sec);
+ });
+ DWOFile.finalize();
+ WriteObjectFile(DWOFile, File);
+}
+
+void CopyBinary(const ELFObjectFile<ELF64LE> &ObjFile) {
std::unique_ptr<Object<ELF64LE>> Obj;
+
if (!OutputFormat.empty() && OutputFormat != "binary")
error("invalid output format '" + OutputFormat + "'");
if (!OutputFormat.empty() && OutputFormat == "binary")
@@ -96,6 +147,9 @@ void CopyBinary(const ELFObjectFile<ELF64LE> &ObjFile) {
else
Obj = llvm::make_unique<ELFObject<ELF64LE>>(ObjFile);
+ if (!SplitDWO.empty())
+ SplitDWOToFile<ELF64LE>(ObjFile, SplitDWO.getValue());
+
SectionPred RemovePred = [](const SectionBase &) { return false; };
if (!ToRemove.empty()) {
@@ -105,6 +159,16 @@ void CopyBinary(const ELFObjectFile<ELF64LE> &ObjFile) {
};
}
+ if (StripDWO || !SplitDWO.empty())
+ RemovePred = [RemovePred](const SectionBase &Sec) {
+ return IsDWOSection(Sec) || RemovePred(Sec);
+ };
+
+ if (ExtractDWO)
+ RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+ return OnlyKeepDWOPred(*Obj, Sec) || RemovePred(Sec);
+ };
+
if (StripSections) {
RemovePred = [RemovePred](const SectionBase &Sec) {
return RemovePred(Sec) || (Sec.Flags & SHF_ALLOC) == 0;
@@ -113,21 +177,8 @@ void CopyBinary(const ELFObjectFile<ELF64LE> &ObjFile) {
}
Obj->removeSections(RemovePred);
-
Obj->finalize();
- ErrorOr<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
- FileOutputBuffer::create(OutputFilename, Obj->totalSize(),
- FileOutputBuffer::F_executable);
- if (BufferOrErr.getError())
- error("failed to open " + OutputFilename);
- else
- Buffer = std::move(*BufferOrErr);
- std::error_code EC;
- if (EC)
- report_fatal_error(EC.message());
- Obj->write(*Buffer);
- if (auto EC = Buffer->commit())
- reportError(OutputFilename, EC);
+ WriteObjectFile(*Obj, OutputFilename.getValue());
}
int main(int argc, char **argv) {
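
CopyBinary() builds its section filter by repeatedly wrapping the current RemovePred in a new lambda, so each command-line option only has to OR its own condition onto whatever was already requested. A minimal sketch of that composition pattern over std::function, with placeholder section and predicate types, is:

    #include <functional>
    #include <string>
    #include <vector>

    struct Section { std::string Name; };
    using SectionPred = std::function<bool(const Section &)>;

    // Mirrors how CopyBinary() layers filters: start with "remove nothing",
    // then wrap the previous predicate each time another option adds a rule.
    SectionPred buildRemovePred(const std::vector<std::string> &ToRemove,
                                bool StripDWO) {
      SectionPred RemovePred = [](const Section &) { return false; };

      if (!ToRemove.empty())
        RemovePred = [&ToRemove](const Section &Sec) {
          for (const auto &Name : ToRemove)
            if (Name == Sec.Name)
              return true;
          return false;
        };

      if (StripDWO)
        RemovePred = [RemovePred](const Section &Sec) {
          bool IsDWO = Sec.Name.size() >= 4 &&
                       Sec.Name.compare(Sec.Name.size() - 4, 4, ".dwo") == 0;
          return IsDWO || RemovePred(Sec); // keep earlier removals in effect
        };

      return RemovePred;
    }
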
diff --git a/tools/llvm-objdump/CMakeLists.txt b/tools/llvm-objdump/CMakeLists.txt
index 27e6145dfc1..043a181d639 100644
--- a/tools/llvm-objdump/CMakeLists.txt
+++ b/tools/llvm-objdump/CMakeLists.txt
@@ -25,3 +25,7 @@ add_llvm_tool(llvm-objdump
if(HAVE_LIBXAR)
target_link_libraries(llvm-objdump ${XAR_LIB})
endif()
+
+if(LLVM_INSTALL_BINUTILS_SYMLINKS)
+ add_llvm_tool_symlink(objdump llvm-objdump)
+endif()
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index 09396466c40..02eaa89f088 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -865,8 +865,19 @@ static void printRelocationTargetName(const MachOObjectFile *O,
} else {
section_iterator SI = O->section_begin();
// Adjust for the fact that sections are 1-indexed.
- advance(SI, Val - 1);
- SI->getName(S);
+ if (Val == 0) {
+ fmt << "0 (?,?)";
+ return;
+ }
+ uint32_t i = Val - 1;
+ while (i != 0 && SI != O->section_end()) {
+ i--;
+ advance(SI, 1);
+ }
+ if (SI == O->section_end())
+ fmt << Val << " (?,?)";
+ else
+ SI->getName(S);
}
fmt << S;
@@ -2183,11 +2194,10 @@ int main(int argc, char **argv) {
&& !PrintFaultMaps
&& DwarfDumpType == DIDT_Null) {
cl::PrintHelpMessage();
- return 2;
- }
-
- std::for_each(InputFilenames.begin(), InputFilenames.end(),
- DumpInput);
-
- return EXIT_SUCCESS;
-}
+ return 2;
+ }
+
+ llvm::for_each(InputFilenames, DumpInput);
+
+ return EXIT_SUCCESS;
+}
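
The printRelocationTargetName() change stops std::advance from walking past section_end() when a Mach-O relocation carries a bogus section ordinal, as exercised by the new macho-invalid-reloc-section-index input that prints "8388613 (?,?)". The bounded-advance idea in isolation, over any forward iterator range, looks roughly like:

    #include <iterator>

    // Advance It by up to N steps, stopping at End. Returns true only if all
    // N steps were taken and It still refers to a dereferenceable element,
    // mirroring the guarded walk added to printRelocationTargetName().
    template <typename Iter>
    bool advanceUpTo(Iter &It, Iter End, unsigned N) {
      while (N != 0 && It != End) {
        --N;
        ++It;
      }
      return N == 0 && It != End;
    }
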
diff --git a/tools/llvm-pdbutil/llvm-pdbutil.cpp b/tools/llvm-pdbutil/llvm-pdbutil.cpp
index 8b2d5ce179f..bee9f182e3f 100644
--- a/tools/llvm-pdbutil/llvm-pdbutil.cpp
+++ b/tools/llvm-pdbutil/llvm-pdbutil.cpp
@@ -1199,20 +1199,17 @@ int main(int argc_, const char *argv_[]) {
opts::pretty::ExcludeCompilands.push_back(
"f:\\\\binaries\\\\Intermediate\\\\vctools\\\\crt_bld");
opts::pretty::ExcludeCompilands.push_back("f:\\\\dd\\\\vctools\\\\crt");
- opts::pretty::ExcludeCompilands.push_back(
- "d:\\\\th.obj.x86fre\\\\minkernel");
- }
- std::for_each(opts::pretty::InputFilenames.begin(),
- opts::pretty::InputFilenames.end(), dumpPretty);
- } else if (opts::DumpSubcommand) {
- std::for_each(opts::dump::InputFilenames.begin(),
- opts::dump::InputFilenames.end(), dumpRaw);
- } else if (opts::BytesSubcommand) {
- std::for_each(opts::bytes::InputFilenames.begin(),
- opts::bytes::InputFilenames.end(), dumpBytes);
- } else if (opts::DiffSubcommand) {
- for (StringRef S : opts::diff::RawModiEquivalences) {
- StringRef Left;
+ opts::pretty::ExcludeCompilands.push_back(
+ "d:\\\\th.obj.x86fre\\\\minkernel");
+ }
+ llvm::for_each(opts::pretty::InputFilenames, dumpPretty);
+ } else if (opts::DumpSubcommand) {
+ llvm::for_each(opts::dump::InputFilenames, dumpRaw);
+ } else if (opts::BytesSubcommand) {
+ llvm::for_each(opts::bytes::InputFilenames, dumpBytes);
+ } else if (opts::DiffSubcommand) {
+ for (StringRef S : opts::diff::RawModiEquivalences) {
+ StringRef Left;
StringRef Right;
std::tie(Left, Right) = S.split(',');
uint32_t X, Y;
diff --git a/tools/llvm-readobj/CMakeLists.txt b/tools/llvm-readobj/CMakeLists.txt
index 54471674173..dafc9e10cfa 100644
--- a/tools/llvm-readobj/CMakeLists.txt
+++ b/tools/llvm-readobj/CMakeLists.txt
@@ -23,3 +23,7 @@ add_llvm_tool(llvm-readobj
)
add_llvm_tool_symlink(llvm-readelf llvm-readobj)
+
+if(LLVM_INSTALL_BINUTILS_SYMLINKS)
+ add_llvm_tool_symlink(readelf llvm-readobj)
+endif()
diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp
index 05b7c800cc1..851988110ea 100644
--- a/tools/llvm-readobj/llvm-readobj.cpp
+++ b/tools/llvm-readobj/llvm-readobj.cpp
@@ -566,14 +566,13 @@ int main(int argc, const char *argv[]) {
cl::ParseCommandLineOptions(argc, argv, "LLVM Object Reader\n");
// Default to stdin if no filename is specified.
- if (opts::InputFilenames.size() == 0)
- opts::InputFilenames.push_back("-");
-
- std::for_each(opts::InputFilenames.begin(), opts::InputFilenames.end(),
- dumpInput);
-
- if (opts::CodeViewMergedTypes) {
- ScopedPrinter W(outs());
+ if (opts::InputFilenames.size() == 0)
+ opts::InputFilenames.push_back("-");
+
+ llvm::for_each(opts::InputFilenames, dumpInput);
+
+ if (opts::CodeViewMergedTypes) {
+ ScopedPrinter W(outs());
dumpCodeViewMergedTypes(W, CVTypes.IDTable, CVTypes.TypeTable);
}
diff --git a/tools/llvm-size/CMakeLists.txt b/tools/llvm-size/CMakeLists.txt
index 60345739c35..7ef4f1769b8 100644
--- a/tools/llvm-size/CMakeLists.txt
+++ b/tools/llvm-size/CMakeLists.txt
@@ -6,3 +6,7 @@ set(LLVM_LINK_COMPONENTS
add_llvm_tool(llvm-size
llvm-size.cpp
)
+
+if(LLVM_INSTALL_BINUTILS_SYMLINKS)
+ add_llvm_tool_symlink(size llvm-size)
+endif()
diff --git a/tools/llvm-size/llvm-size.cpp b/tools/llvm-size/llvm-size.cpp
index bdb118a264e..7a8e744d2e6 100644
--- a/tools/llvm-size/llvm-size.cpp
+++ b/tools/llvm-size/llvm-size.cpp
@@ -880,14 +880,13 @@ int main(int argc, char **argv) {
}
if (InputFilenames.size() == 0)
- InputFilenames.push_back("a.out");
-
- MoreThanOneFile = InputFilenames.size() > 1;
- std::for_each(InputFilenames.begin(), InputFilenames.end(),
- printFileSectionSizes);
- if (OutputFormat == berkeley && TotalSizes)
- printBerkelyTotals();
-
+ InputFilenames.push_back("a.out");
+
+ MoreThanOneFile = InputFilenames.size() > 1;
+ llvm::for_each(InputFilenames, printFileSectionSizes);
+ if (OutputFormat == berkeley && TotalSizes)
+ printBerkelyTotals();
+
if (HadError)
return 1;
}
diff --git a/tools/llvm-strings/CMakeLists.txt b/tools/llvm-strings/CMakeLists.txt
index 9339892a499..390f1175139 100644
--- a/tools/llvm-strings/CMakeLists.txt
+++ b/tools/llvm-strings/CMakeLists.txt
@@ -8,3 +8,6 @@ add_llvm_tool(llvm-strings
llvm-strings.cpp
)
+if(LLVM_INSTALL_BINUTILS_SYMLINKS)
+ add_llvm_tool_symlink(strings llvm-strings)
+endif()
diff --git a/tools/llvm-symbolizer/CMakeLists.txt b/tools/llvm-symbolizer/CMakeLists.txt
index b04c45ff744..d9b05208afd 100644
--- a/tools/llvm-symbolizer/CMakeLists.txt
+++ b/tools/llvm-symbolizer/CMakeLists.txt
@@ -14,3 +14,7 @@ set(LLVM_LINK_COMPONENTS
add_llvm_tool(llvm-symbolizer
llvm-symbolizer.cpp
)
+
+if(LLVM_INSTALL_BINUTILS_SYMLINKS)
+ add_llvm_tool_symlink(addr2line llvm-symbolizer)
+endif()
diff --git a/tools/llvm-xray/trie-node.h b/tools/llvm-xray/trie-node.h
new file mode 100644
index 00000000000..e6ba4e215b9
--- /dev/null
+++ b/tools/llvm-xray/trie-node.h
@@ -0,0 +1,92 @@
+//===- trie-node.h - XRay Call Stack Data Structure -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides a data structure and routines for working with call stacks
+// of instrumented functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_XRAY_STACK_TRIE_H
+#define LLVM_TOOLS_LLVM_XRAY_STACK_TRIE_H
+
+#include <forward_list>
+#include <numeric>
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+
+/// A type to represent a trie of invocations. It is useful to construct a
+/// graph of these nodes from reading an XRay trace, such that each function
+/// call can be placed in a larger context.
+///
+/// The template parameter allows users of the template to attach their own
+/// data elements to each node in the invocation graph.
+template <typename AssociatedData> struct TrieNode {
+ /// The function ID.
+ int32_t FuncId;
+
+ /// The caller of this function.
+ TrieNode<AssociatedData> *Parent;
+
+ /// The callees from this function.
+ llvm::SmallVector<TrieNode<AssociatedData> *, 4> Callees;
+
+ /// Additional parameterized data on each node.
+ AssociatedData ExtraData;
+};
+
+/// Merges together two TrieNodes with like function ids, aggregating their
+/// callee lists and durations. The caller must provide storage where new merged
+/// nodes can be allocated in the form of a linked list.
+template <typename T, typename Callable>
+TrieNode<T> *
+mergeTrieNodes(const TrieNode<T> &Left, const TrieNode<T> &Right,
+ /*Non-deduced pointer type for nullptr compatibility*/
+ typename std::remove_reference<TrieNode<T> *>::type NewParent,
+ std::forward_list<TrieNode<T>> &NodeStore,
+ Callable &&MergeCallable) {
+ llvm::function_ref<T(const T &, const T &)> MergeFn(
+ std::forward<Callable>(MergeCallable));
+ assert(Left.FuncId == Right.FuncId);
+ NodeStore.push_front(TrieNode<T>{
+ Left.FuncId, NewParent, {}, MergeFn(Left.ExtraData, Right.ExtraData)});
+ auto I = NodeStore.begin();
+ auto *Node = &*I;
+
+ // Build a map of callees from the left side.
+ llvm::DenseMap<int32_t, TrieNode<T> *> LeftCalleesByFuncId;
+ for (auto *Callee : Left.Callees) {
+ LeftCalleesByFuncId[Callee->FuncId] = Callee;
+ }
+
+ // Iterate through the right side, either merging with the map values or
+ // directly adding to the Callees vector. The iteration also removes any
+ // merged values from the left side map.
+ // TODO: Unroll into iterative and explicit stack for efficiency.
+ for (auto *Callee : Right.Callees) {
+ auto iter = LeftCalleesByFuncId.find(Callee->FuncId);
+ if (iter != LeftCalleesByFuncId.end()) {
+ Node->Callees.push_back(
+ mergeTrieNodes(*(iter->second), *Callee, Node, NodeStore, MergeFn));
+ LeftCalleesByFuncId.erase(iter);
+ } else {
+ Node->Callees.push_back(Callee);
+ }
+ }
+
+ // Add any callees that weren't found in the right side.
+ for (auto MapPairIter : LeftCalleesByFuncId) {
+ Node->Callees.push_back(MapPairIter.second);
+ }
+
+ return Node;
+}
+
+#endif // LLVM_TOOLS_LLVM_XRAY_STACK_TRIE_H
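
Because trie-node.h now parameterizes the per-node payload, both llvm-xray subcommands can share mergeTrieNodes() and only supply a payload-merging callable. A small usage sketch, assuming the header above is on the include path and using a plain int payload, might be:

    #include "trie-node.h"
    #include <cassert>
    #include <forward_list>

    int main() {
      // Two roots for the same function id, e.g. gathered from two threads.
      TrieNode<int> Left{/*FuncId=*/7, /*Parent=*/nullptr, /*Callees=*/{},
                         /*ExtraData=*/10};
      TrieNode<int> Right{7, nullptr, {}, 32};

      std::forward_list<TrieNode<int>> Store; // owns every merged node
      TrieNode<int> *Merged = mergeTrieNodes(
          Left, Right, /*NewParent=*/nullptr, Store,
          [](const int &A, const int &B) { return A + B; });

      assert(Merged->FuncId == 7);
      assert(Merged->ExtraData == 42); // payloads combined by the callable
      return 0;
    }
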
diff --git a/tools/llvm-xray/xray-converter.cc b/tools/llvm-xray/xray-converter.cc
index f1aec65bc67..aa0da55207b 100644
--- a/tools/llvm-xray/xray-converter.cc
+++ b/tools/llvm-xray/xray-converter.cc
@@ -12,10 +12,12 @@
//===----------------------------------------------------------------------===//
#include "xray-converter.h"
+#include "trie-node.h"
#include "xray-registry.h"
#include "llvm/DebugInfo/Symbolize/Symbolize.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Support/YAMLTraits.h"
#include "llvm/Support/raw_ostream.h"
@@ -32,11 +34,14 @@ static cl::SubCommand Convert("convert", "Trace Format Conversion");
static cl::opt<std::string> ConvertInput(cl::Positional,
cl::desc("<xray log file>"),
cl::Required, cl::sub(Convert));
-enum class ConvertFormats { BINARY, YAML };
+enum class ConvertFormats { BINARY, YAML, CHROME_TRACE_EVENT };
static cl::opt<ConvertFormats> ConvertOutputFormat(
"output-format", cl::desc("output format"),
cl::values(clEnumValN(ConvertFormats::BINARY, "raw", "output in binary"),
- clEnumValN(ConvertFormats::YAML, "yaml", "output in yaml")),
+ clEnumValN(ConvertFormats::YAML, "yaml", "output in yaml"),
+ clEnumValN(ConvertFormats::CHROME_TRACE_EVENT, "trace_event",
+ "Output in chrome's trace event format. "
+ "May be visualized with the Catapult trace viewer.")),
cl::sub(Convert));
static cl::alias ConvertOutputFormat2("f", cl::aliasopt(ConvertOutputFormat),
cl::desc("Alias for -output-format"),
@@ -142,6 +147,192 @@ void TraceConverter::exportAsRAWv1(const Trace &Records, raw_ostream &OS) {
}
}
+namespace {
+
+// A structure that allows building a dictionary of stack ids for the Chrome
+// trace event format.
+struct StackIdData {
+ // Each Stack of function calls has a unique ID.
+ unsigned id;
+
+ // Bookkeeping so that IDs can be maintained uniquely across threads.
+  // Traversal keeps sibling pointers to other threads' stacks. This is helpful
+ // to determine when a thread encounters a new stack and should assign a new
+ // unique ID.
+ SmallVector<TrieNode<StackIdData> *, 4> siblings;
+};
+
+using StackTrieNode = TrieNode<StackIdData>;
+
+// A helper function to find the sibling nodes for an encountered function in a
+// thread of execution. Relies on the invariant that each time a new node is
+// traversed in a thread, sibling bidirectional pointers are maintained.
+SmallVector<StackTrieNode *, 4>
+findSiblings(StackTrieNode *parent, int32_t FnId, uint32_t TId,
+ const DenseMap<uint32_t, SmallVector<StackTrieNode *, 4>>
+ &StackRootsByThreadId) {
+
+ SmallVector<StackTrieNode *, 4> Siblings{};
+
+ if (parent == nullptr) {
+ for (auto map_iter : StackRootsByThreadId) {
+ // Only look for siblings in other threads.
+ if (map_iter.first != TId)
+ for (auto node_iter : map_iter.second) {
+ if (node_iter->FuncId == FnId)
+ Siblings.push_back(node_iter);
+ }
+ }
+ return Siblings;
+ }
+
+ for (auto *ParentSibling : parent->ExtraData.siblings)
+ for (auto node_iter : ParentSibling->Callees)
+ if (node_iter->FuncId == FnId)
+ Siblings.push_back(node_iter);
+
+ return Siblings;
+}
+
+// Given a function being invoked in a thread with id TId, finds and returns the
+// StackTrie representing the function call stack. If no node exists, creates
+// the node. Assigns unique IDs to stacks newly encountered among all threads
+// and keeps sibling links up to date when creating new nodes.
+StackTrieNode *findOrCreateStackNode(
+ StackTrieNode *Parent, int32_t FuncId, uint32_t TId,
+ DenseMap<uint32_t, SmallVector<StackTrieNode *, 4>> &StackRootsByThreadId,
+ DenseMap<unsigned, StackTrieNode *> &StacksByStackId, unsigned *id_counter,
+ std::forward_list<StackTrieNode> &NodeStore) {
+ SmallVector<StackTrieNode *, 4> &ParentCallees =
+ Parent == nullptr ? StackRootsByThreadId[TId] : Parent->Callees;
+ auto match = find_if(ParentCallees, [FuncId](StackTrieNode *ParentCallee) {
+ return FuncId == ParentCallee->FuncId;
+ });
+ if (match != ParentCallees.end())
+ return *match;
+
+ SmallVector<StackTrieNode *, 4> siblings =
+ findSiblings(Parent, FuncId, TId, StackRootsByThreadId);
+ if (siblings.empty()) {
+ NodeStore.push_front({FuncId, Parent, {}, {(*id_counter)++, {}}});
+ StackTrieNode *CurrentStack = &NodeStore.front();
+ StacksByStackId[*id_counter - 1] = CurrentStack;
+ ParentCallees.push_back(CurrentStack);
+ return CurrentStack;
+ }
+ unsigned stack_id = siblings[0]->ExtraData.id;
+ NodeStore.push_front({FuncId, Parent, {}, {stack_id, std::move(siblings)}});
+ StackTrieNode *CurrentStack = &NodeStore.front();
+ for (auto *sibling : CurrentStack->ExtraData.siblings)
+ sibling->ExtraData.siblings.push_back(CurrentStack);
+ ParentCallees.push_back(CurrentStack);
+ return CurrentStack;
+}
+
+void writeTraceViewerRecord(raw_ostream &OS, int32_t FuncId, uint32_t TId,
+ bool Symbolize,
+ const FuncIdConversionHelper &FuncIdHelper,
+ double EventTimestampUs,
+ const StackTrieNode &StackCursor,
+ StringRef FunctionPhenotype) {
+ OS << " ";
+ OS << llvm::formatv(
+ R"({ "name" : "{0}", "ph" : "{1}", "tid" : "{2}", "pid" : "1", )"
+ R"("ts" : "{3:f3}", "sf" : "{4}" })",
+ (Symbolize ? FuncIdHelper.SymbolOrNumber(FuncId)
+ : llvm::to_string(FuncId)),
+ FunctionPhenotype, TId, EventTimestampUs, StackCursor.ExtraData.id);
+}
+
+} // namespace
+
+void TraceConverter::exportAsChromeTraceEventFormat(const Trace &Records,
+ raw_ostream &OS) {
+ const auto &FH = Records.getFileHeader();
+ auto CycleFreq = FH.CycleFrequency;
+
+ unsigned id_counter = 0;
+
+ OS << "{\n \"traceEvents\": [";
+ DenseMap<uint32_t, StackTrieNode *> StackCursorByThreadId{};
+ DenseMap<uint32_t, SmallVector<StackTrieNode *, 4>> StackRootsByThreadId{};
+ DenseMap<unsigned, StackTrieNode *> StacksByStackId{};
+ std::forward_list<StackTrieNode> NodeStore{};
+ int loop_count = 0;
+ for (const auto &R : Records) {
+ if (loop_count++ == 0)
+ OS << "\n";
+ else
+ OS << ",\n";
+
+ // Chrome trace event format always wants data in micros.
+ // CyclesPerMicro = CycleHertz / 10^6
+ // TSC / CyclesPerMicro == TSC * 10^6 / CycleHertz == MicroTimestamp
+ // Could lose some precision here by converting the TSC to a double to
+ // multiply by the period in micros. 52 bit mantissa is a good start though.
+ // TODO: Make feature request to Chrome Trace viewer to accept ticks and a
+ // frequency or do some more involved calculation to avoid dangers of
+ // conversion.
+ double EventTimestampUs = double(1000000) / CycleFreq * double(R.TSC);
+ StackTrieNode *&StackCursor = StackCursorByThreadId[R.TId];
+ switch (R.Type) {
+ case RecordTypes::ENTER:
+ case RecordTypes::ENTER_ARG:
+ StackCursor = findOrCreateStackNode(StackCursor, R.FuncId, R.TId,
+ StackRootsByThreadId, StacksByStackId,
+ &id_counter, NodeStore);
+ // Each record is represented as a json dictionary with function name,
+ // type of B for begin or E for end, thread id, process id (faked),
+ // timestamp in microseconds, and a stack frame id. The ids are logged
+ // in an id dictionary after the events.
+ writeTraceViewerRecord(OS, R.FuncId, R.TId, Symbolize, FuncIdHelper,
+ EventTimestampUs, *StackCursor, "B");
+ break;
+ case RecordTypes::EXIT:
+ case RecordTypes::TAIL_EXIT:
+ // No entries to record end for.
+ if (StackCursor == nullptr)
+ break;
+ // Should we emit an END record anyway or account this condition?
+ // (And/Or in loop termination below)
+ StackTrieNode *PreviousCursor = nullptr;
+ do {
+ writeTraceViewerRecord(OS, StackCursor->FuncId, R.TId, Symbolize,
+ FuncIdHelper, EventTimestampUs, *StackCursor,
+ "E");
+ PreviousCursor = StackCursor;
+ StackCursor = StackCursor->Parent;
+ } while (PreviousCursor->FuncId != R.FuncId && StackCursor != nullptr);
+ break;
+ }
+ }
+ OS << "\n ],\n"; // Close the Trace Events array.
+ OS << " "
+ << "\"displayTimeUnit\": \"ns\",\n";
+
+ // The stackFrames dictionary substantially reduces size of the output file by
+ // avoiding repeating the entire call stack of function names for each entry.
+ OS << R"( "stackFrames": {)";
+ int stack_frame_count = 0;
+ for (auto map_iter : StacksByStackId) {
+ if (stack_frame_count++ == 0)
+ OS << "\n";
+ else
+ OS << ",\n";
+ OS << " ";
+ OS << llvm::formatv(
+ R"("{0}" : { "name" : "{1}")", map_iter.first,
+ (Symbolize ? FuncIdHelper.SymbolOrNumber(map_iter.second->FuncId)
+ : llvm::to_string(map_iter.second->FuncId)));
+ if (map_iter.second->Parent != nullptr)
+ OS << llvm::formatv(R"(, "parent": "{0}")",
+ map_iter.second->Parent->ExtraData.id);
+ OS << " }";
+ }
+ OS << "\n }\n"; // Close the stack frames map.
+ OS << "}\n"; // Close the JSON entry.
+}
+
namespace llvm {
namespace xray {
@@ -191,6 +382,9 @@ static CommandRegistration Unused(&Convert, []() -> Error {
case ConvertFormats::BINARY:
TC.exportAsRAWv1(T, OS);
break;
+ case ConvertFormats::CHROME_TRACE_EVENT:
+ TC.exportAsChromeTraceEventFormat(T, OS);
+ break;
}
return Error::success();
});
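
exportAsChromeTraceEventFormat() converts raw TSC values into the microsecond timestamps the trace event format expects by dividing out the cycle frequency recorded in the log header, accepting the precision loss the in-code comment mentions. The arithmetic on its own, with an assumed 3 GHz cycle frequency, is just:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Values are illustrative: CycleFrequency comes from the XRay file
      // header, TSC from each record.
      const uint64_t CycleFrequency = 3'000'000'000; // assumed 3 GHz clock
      const uint64_t TSC = 6'000'000;                // 6M cycles into the trace

      // Same expression as the converter: TSC * 10^6 / CycleHertz, done in
      // double to avoid overflowing 64 bits with TSC * 1'000'000.
      double EventTimestampUs = double(1000000) / CycleFrequency * double(TSC);

      std::printf("%.3f us\n", EventTimestampUs); // prints 2000.000 us
      return 0;
    }
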
diff --git a/tools/llvm-xray/xray-converter.h b/tools/llvm-xray/xray-converter.h
index fa0d5e132f1..5f0a3ee298e 100644
--- a/tools/llvm-xray/xray-converter.h
+++ b/tools/llvm-xray/xray-converter.h
@@ -15,8 +15,8 @@
#define LLVM_TOOLS_LLVM_XRAY_XRAY_CONVERTER_H
#include "func-id-helper.h"
-#include "llvm/XRay/XRayRecord.h"
#include "llvm/XRay/Trace.h"
+#include "llvm/XRay/XRayRecord.h"
namespace llvm {
namespace xray {
@@ -31,6 +31,11 @@ public:
void exportAsYAML(const Trace &Records, raw_ostream &OS);
void exportAsRAWv1(const Trace &Records, raw_ostream &OS);
+
+ /// For this conversion, the Function records within each thread are expected
+ /// to be in sorted TSC order. The trace event format encodes stack traces, so
+ /// the linear history is essential for correct output.
+ void exportAsChromeTraceEventFormat(const Trace &Records, raw_ostream &OS);
};
} // namespace xray
diff --git a/tools/llvm-xray/xray-stacks.cc b/tools/llvm-xray/xray-stacks.cc
index fd5df82e093..9474de04799 100644
--- a/tools/llvm-xray/xray-stacks.cc
+++ b/tools/llvm-xray/xray-stacks.cc
@@ -19,6 +19,7 @@
#include <numeric>
#include "func-id-helper.h"
+#include "trie-node.h"
#include "xray-registry.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
@@ -255,96 +256,61 @@ private:
/// maintain an index of unique functions, and provide a means of iterating
/// through all the instrumented call stacks which we know about.
-struct TrieNode {
- int32_t FuncId;
- TrieNode *Parent;
- SmallVector<TrieNode *, 4> Callees;
- // Separate durations depending on whether the node is the deepest node in the
- // stack.
- SmallVector<int64_t, 4> TerminalDurations;
- SmallVector<int64_t, 4> IntermediateDurations;
+struct StackDuration {
+ llvm::SmallVector<int64_t, 4> TerminalDurations;
+ llvm::SmallVector<int64_t, 4> IntermediateDurations;
};
-/// Merges together two TrieNodes with like function ids, aggregating their
-/// callee lists and durations. The caller must provide storage where new merged
-/// nodes can be allocated in the form of a linked list.
-TrieNode *mergeTrieNodes(const TrieNode &Left, const TrieNode &Right,
- TrieNode *NewParent,
- std::forward_list<TrieNode> &NodeStore) {
- assert(Left.FuncId == Right.FuncId);
- NodeStore.push_front(TrieNode{Left.FuncId, NewParent, {}, {}, {}});
- auto I = NodeStore.begin();
- auto *Node = &*I;
-
- // Build a map of callees from the left side.
- DenseMap<int32_t, TrieNode *> LeftCalleesByFuncId;
- for (auto *Callee : Left.Callees) {
- LeftCalleesByFuncId[Callee->FuncId] = Callee;
- }
-
- // Iterate through the right side, either merging with the map values or
- // directly adding to the Callees vector. The iteration also removes any
- // merged values from the left side map.
- for (auto *Callee : Right.Callees) {
- auto iter = LeftCalleesByFuncId.find(Callee->FuncId);
- if (iter != LeftCalleesByFuncId.end()) {
- Node->Callees.push_back(
- mergeTrieNodes(*(iter->second), *Callee, Node, NodeStore));
- LeftCalleesByFuncId.erase(iter);
- } else {
- Node->Callees.push_back(Callee);
- }
- }
-
- // Add any callees that weren't found in the right side.
- for (auto MapPairIter : LeftCalleesByFuncId) {
- Node->Callees.push_back(MapPairIter.second);
- }
-
+StackDuration mergeStackDuration(const StackDuration &Left,
+ const StackDuration &Right) {
+ StackDuration Data{};
+ Data.TerminalDurations.reserve(Left.TerminalDurations.size() +
+ Right.TerminalDurations.size());
+ Data.IntermediateDurations.reserve(Left.IntermediateDurations.size() +
+ Right.IntermediateDurations.size());
// Aggregate the durations.
- for (auto duration : Left.TerminalDurations) {
- Node->TerminalDurations.push_back(duration);
- }
- for (auto duration : Right.TerminalDurations) {
- Node->TerminalDurations.push_back(duration);
- }
- for (auto duration : Left.IntermediateDurations) {
- Node->IntermediateDurations.push_back(duration);
- }
- for (auto duration : Right.IntermediateDurations) {
- Node->IntermediateDurations.push_back(duration);
- }
-
- return Node;
+ for (auto duration : Left.TerminalDurations)
+ Data.TerminalDurations.push_back(duration);
+ for (auto duration : Right.TerminalDurations)
+ Data.TerminalDurations.push_back(duration);
+
+ for (auto duration : Left.IntermediateDurations)
+ Data.IntermediateDurations.push_back(duration);
+ for (auto duration : Right.IntermediateDurations)
+ Data.IntermediateDurations.push_back(duration);
+ return Data;
}
+using StackTrieNode = TrieNode<StackDuration>;
+
template <AggregationType AggType>
-std::size_t GetValueForStack(const TrieNode *Node);
+std::size_t GetValueForStack(const StackTrieNode *Node);
// When computing total time spent in a stack, we're adding the timings from
// its callees and the timings from when it was a leaf.
template <>
std::size_t
-GetValueForStack<AggregationType::TOTAL_TIME>(const TrieNode *Node) {
- auto TopSum = std::accumulate(Node->TerminalDurations.begin(),
- Node->TerminalDurations.end(), 0uLL);
- return std::accumulate(Node->IntermediateDurations.begin(),
- Node->IntermediateDurations.end(), TopSum);
+GetValueForStack<AggregationType::TOTAL_TIME>(const StackTrieNode *Node) {
+ auto TopSum = std::accumulate(Node->ExtraData.TerminalDurations.begin(),
+ Node->ExtraData.TerminalDurations.end(), 0uLL);
+ return std::accumulate(Node->ExtraData.IntermediateDurations.begin(),
+ Node->ExtraData.IntermediateDurations.end(), TopSum);
}
// Calculates how many times a function was invoked.
// TODO: Hook up option to produce stacks
template <>
std::size_t
-GetValueForStack<AggregationType::INVOCATION_COUNT>(const TrieNode *Node) {
- return Node->TerminalDurations.size() + Node->IntermediateDurations.size();
+GetValueForStack<AggregationType::INVOCATION_COUNT>(const StackTrieNode *Node) {
+ return Node->ExtraData.TerminalDurations.size() +
+ Node->ExtraData.IntermediateDurations.size();
}
// Make sure there are implementations for each enum value.
template <AggregationType T> struct DependentFalseType : std::false_type {};
template <AggregationType AggType>
-std::size_t GetValueForStack(const TrieNode *Node) {
+std::size_t GetValueForStack(const StackTrieNode *Node) {
static_assert(DependentFalseType<AggType>::value,
"No implementation found for aggregation type provided.");
return 0;
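
The dependent-false trait above is what keeps the static_assert from firing unconditionally: a bare static_assert(false, ...) is ill-formed even if the primary template is never instantiated, while a condition that depends on the template parameter is only evaluated at instantiation, so the error surfaces only for an AggregationType with no specialization. A minimal standalone sketch of the same pattern (illustrative names only, not the tool's code):

#include <type_traits>

enum class Kind { A, B };

template <Kind K> struct AlwaysFalse : std::false_type {};

// Primary template: instantiating it for an unhandled Kind is a compile error.
template <Kind K> int handle() {
  static_assert(AlwaysFalse<K>::value, "no handler for this Kind");
  return 0;
}

// Explicit specializations provide the real implementations.
template <> int handle<Kind::A>() { return 1; }
template <> int handle<Kind::B>() { return 2; }
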
@@ -353,21 +319,21 @@ std::size_t GetValueForStack(const TrieNode *Node) {
class StackTrie {
  // Use an alias to avoid propagating the magic number 4 through the code.
// We use this SmallVector to track the root nodes in a call graph.
- using RootVector = SmallVector<TrieNode *, 4>;
+ using RootVector = SmallVector<StackTrieNode *, 4>;
// We maintain pointers to the roots of the tries we see.
DenseMap<uint32_t, RootVector> Roots;
// We make sure all the nodes are accounted for in this list.
- std::forward_list<TrieNode> NodeStore;
+ std::forward_list<StackTrieNode> NodeStore;
  // A map of thread ids to pairs of call stack trie nodes and their start times.
- DenseMap<uint32_t, SmallVector<std::pair<TrieNode *, uint64_t>, 8>>
+ DenseMap<uint32_t, SmallVector<std::pair<StackTrieNode *, uint64_t>, 8>>
ThreadStackMap;
- TrieNode *createTrieNode(uint32_t ThreadId, int32_t FuncId,
- TrieNode *Parent) {
- NodeStore.push_front(TrieNode{FuncId, Parent, {}, {}, {}});
+ StackTrieNode *createTrieNode(uint32_t ThreadId, int32_t FuncId,
+ StackTrieNode *Parent) {
+ NodeStore.push_front(StackTrieNode{FuncId, Parent, {}, {{}, {}}});
auto I = NodeStore.begin();
auto *Node = &*I;
if (!Parent)
@@ -375,10 +341,10 @@ class StackTrie {
return Node;
}
- TrieNode *findRootNode(uint32_t ThreadId, int32_t FuncId) {
+ StackTrieNode *findRootNode(uint32_t ThreadId, int32_t FuncId) {
const auto &RootsByThread = Roots[ThreadId];
auto I = find_if(RootsByThread,
- [&](TrieNode *N) { return N->FuncId == FuncId; });
+ [&](StackTrieNode *N) { return N->FuncId == FuncId; });
return (I == RootsByThread.end()) ? nullptr : *I;
}
@@ -416,7 +382,7 @@ public:
auto &Top = TS.back();
auto I = find_if(Top.first->Callees,
- [&](TrieNode *N) { return N->FuncId == R.FuncId; });
+ [&](StackTrieNode *N) { return N->FuncId == R.FuncId; });
if (I == Top.first->Callees.end()) {
      // We didn't find the callee in the stack trie, so we're going to
      // add it to the stack, then set up the pointers properly.
@@ -447,8 +413,8 @@ public:
return AccountRecordStatus::ENTRY_NOT_FOUND;
}
- auto FunctionEntryMatch =
- find_if(reverse(TS), [&](const std::pair<TrieNode *, uint64_t> &E) {
+ auto FunctionEntryMatch = find_if(
+ reverse(TS), [&](const std::pair<StackTrieNode *, uint64_t> &E) {
return E.first->FuncId == R.FuncId;
});
auto status = AccountRecordStatus::OK;
@@ -461,14 +427,14 @@ public:
}
auto I = FunctionEntryMatch.base();
for (auto &E : make_range(I, TS.end() - 1))
- E.first->IntermediateDurations.push_back(std::max(E.second, R.TSC) -
- std::min(E.second, R.TSC));
+ E.first->ExtraData.IntermediateDurations.push_back(
+ std::max(E.second, R.TSC) - std::min(E.second, R.TSC));
auto &Deepest = TS.back();
if (wasLastRecordExit)
- Deepest.first->IntermediateDurations.push_back(
+ Deepest.first->ExtraData.IntermediateDurations.push_back(
std::max(Deepest.second, R.TSC) - std::min(Deepest.second, R.TSC));
else
- Deepest.first->TerminalDurations.push_back(
+ Deepest.first->ExtraData.TerminalDurations.push_back(
std::max(Deepest.second, R.TSC) - std::min(Deepest.second, R.TSC));
TS.erase(I, TS.end());
return status;
@@ -479,11 +445,11 @@ public:
bool isEmpty() const { return Roots.empty(); }
- void printStack(raw_ostream &OS, const TrieNode *Top,
+ void printStack(raw_ostream &OS, const StackTrieNode *Top,
FuncIdConversionHelper &FN) {
    // Traverse the pointers up to the parent, noting the sums, then print
    // in reverse order (callers at the top, callees at the bottom).
- SmallVector<const TrieNode *, 8> CurrentStack;
+ SmallVector<const StackTrieNode *, 8> CurrentStack;
for (auto *F = Top; F != nullptr; F = F->Parent)
CurrentStack.push_back(F);
int Level = 0;
@@ -491,21 +457,22 @@ public:
"count", "sum");
for (auto *F :
reverse(make_range(CurrentStack.begin() + 1, CurrentStack.end()))) {
- auto Sum = std::accumulate(F->IntermediateDurations.begin(),
- F->IntermediateDurations.end(), 0LL);
+ auto Sum = std::accumulate(F->ExtraData.IntermediateDurations.begin(),
+ F->ExtraData.IntermediateDurations.end(), 0LL);
auto FuncId = FN.SymbolOrNumber(F->FuncId);
OS << formatv("#{0,-4} {1,-60} {2,+12} {3,+16}\n", Level++,
FuncId.size() > 60 ? FuncId.substr(0, 57) + "..." : FuncId,
- F->IntermediateDurations.size(), Sum);
+ F->ExtraData.IntermediateDurations.size(), Sum);
}
auto *Leaf = *CurrentStack.begin();
- auto LeafSum = std::accumulate(Leaf->TerminalDurations.begin(),
- Leaf->TerminalDurations.end(), 0LL);
+ auto LeafSum =
+ std::accumulate(Leaf->ExtraData.TerminalDurations.begin(),
+ Leaf->ExtraData.TerminalDurations.end(), 0LL);
auto LeafFuncId = FN.SymbolOrNumber(Leaf->FuncId);
OS << formatv("#{0,-4} {1,-60} {2,+12} {3,+16}\n", Level++,
LeafFuncId.size() > 60 ? LeafFuncId.substr(0, 57) + "..."
: LeafFuncId,
- Leaf->TerminalDurations.size(), LeafSum);
+ Leaf->ExtraData.TerminalDurations.size(), LeafSum);
OS << "\n";
}
@@ -552,20 +519,20 @@ public:
/// Creates a merged list of Tries for unique stacks that disregards their
/// thread IDs.
- RootVector mergeAcrossThreads(std::forward_list<TrieNode> &NodeStore) {
+ RootVector mergeAcrossThreads(std::forward_list<StackTrieNode> &NodeStore) {
RootVector MergedByThreadRoots;
for (auto MapIter : Roots) {
const auto &RootNodeVector = MapIter.second;
for (auto *Node : RootNodeVector) {
auto MaybeFoundIter =
- find_if(MergedByThreadRoots, [Node](TrieNode *elem) {
+ find_if(MergedByThreadRoots, [Node](StackTrieNode *elem) {
return Node->FuncId == elem->FuncId;
});
if (MaybeFoundIter == MergedByThreadRoots.end()) {
MergedByThreadRoots.push_back(Node);
} else {
- MergedByThreadRoots.push_back(
- mergeTrieNodes(**MaybeFoundIter, *Node, nullptr, NodeStore));
+ MergedByThreadRoots.push_back(mergeTrieNodes(
+ **MaybeFoundIter, *Node, nullptr, NodeStore, mergeStackDuration));
MergedByThreadRoots.erase(MaybeFoundIter);
}
}
@@ -577,7 +544,7 @@ public:
template <AggregationType AggType>
void printAllAggregatingThreads(raw_ostream &OS, FuncIdConversionHelper &FN,
StackOutputFormat format) {
- std::forward_list<TrieNode> AggregatedNodeStore;
+ std::forward_list<StackTrieNode> AggregatedNodeStore;
RootVector MergedByThreadRoots = mergeAcrossThreads(AggregatedNodeStore);
bool reportThreadId = false;
printAll<AggType>(OS, FN, MergedByThreadRoots,
@@ -586,7 +553,7 @@ public:
/// Merges the trie by thread id before printing top stacks.
void printAggregatingThreads(raw_ostream &OS, FuncIdConversionHelper &FN) {
- std::forward_list<TrieNode> AggregatedNodeStore;
+ std::forward_list<StackTrieNode> AggregatedNodeStore;
RootVector MergedByThreadRoots = mergeAcrossThreads(AggregatedNodeStore);
print(OS, FN, MergedByThreadRoots);
}
@@ -595,7 +562,7 @@ public:
template <AggregationType AggType>
void printAll(raw_ostream &OS, FuncIdConversionHelper &FN,
RootVector RootValues, uint32_t ThreadId, bool ReportThread) {
- SmallVector<const TrieNode *, 16> S;
+ SmallVector<const StackTrieNode *, 16> S;
for (const auto *N : RootValues) {
S.clear();
S.push_back(N);
@@ -616,10 +583,10 @@ public:
template <AggregationType AggType>
void printSingleStack(raw_ostream &OS, FuncIdConversionHelper &Converter,
bool ReportThread, uint32_t ThreadId,
- const TrieNode *Node) {
+ const StackTrieNode *Node) {
if (ReportThread)
OS << "thread_" << ThreadId << ";";
- SmallVector<const TrieNode *, 5> lineage{};
+ SmallVector<const StackTrieNode *, 5> lineage{};
lineage.push_back(Node);
while (lineage.back()->Parent != nullptr)
lineage.push_back(lineage.back()->Parent);
@@ -639,15 +606,17 @@ public:
// - Total number of unique stacks
// - Top 10 stacks by count
// - Top 10 stacks by aggregate duration
- SmallVector<std::pair<const TrieNode *, uint64_t>, 11> TopStacksByCount;
- SmallVector<std::pair<const TrieNode *, uint64_t>, 11> TopStacksBySum;
- auto greater_second = [](const std::pair<const TrieNode *, uint64_t> &A,
- const std::pair<const TrieNode *, uint64_t> &B) {
- return A.second > B.second;
- };
+ SmallVector<std::pair<const StackTrieNode *, uint64_t>, 11>
+ TopStacksByCount;
+ SmallVector<std::pair<const StackTrieNode *, uint64_t>, 11> TopStacksBySum;
+ auto greater_second =
+ [](const std::pair<const StackTrieNode *, uint64_t> &A,
+ const std::pair<const StackTrieNode *, uint64_t> &B) {
+ return A.second > B.second;
+ };
uint64_t UniqueStacks = 0;
for (const auto *N : RootValues) {
- SmallVector<const TrieNode *, 16> S;
+ SmallVector<const StackTrieNode *, 16> S;
S.emplace_back(N);
while (!S.empty()) {
@@ -655,10 +624,11 @@ public:
// We only start printing the stack (by walking up the parent pointers)
// when we get to a leaf function.
- if (!Top->TerminalDurations.empty()) {
+ if (!Top->ExtraData.TerminalDurations.empty()) {
++UniqueStacks;
- auto TopSum = std::accumulate(Top->TerminalDurations.begin(),
- Top->TerminalDurations.end(), 0uLL);
+ auto TopSum =
+ std::accumulate(Top->ExtraData.TerminalDurations.begin(),
+ Top->ExtraData.TerminalDurations.end(), 0uLL);
{
auto E = std::make_pair(Top, TopSum);
TopStacksBySum.insert(std::lower_bound(TopStacksBySum.begin(),
@@ -669,7 +639,8 @@ public:
TopStacksBySum.pop_back();
}
{
- auto E = std::make_pair(Top, Top->TerminalDurations.size());
+ auto E =
+ std::make_pair(Top, Top->ExtraData.TerminalDurations.size());
TopStacksByCount.insert(std::lower_bound(TopStacksByCount.begin(),
TopStacksByCount.end(), E,
greater_second),
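
The refactor above moves the duration bookkeeping out of the trie node itself: the node becomes a payload-parameterized TrieNode<StackDuration> (aliased as StackTrieNode), and mergeTrieNodes now receives the payload-merging callable (mergeStackDuration) as an argument. A hedged sketch of that shape, with hypothetical names standing in for the new shared header, which is not shown in this hunk:

#include <cstdint>
#include <forward_list>
#include <unordered_map>
#include <vector>

template <typename ExtraDataT> struct TrieNode {
  int32_t FuncId;
  TrieNode *Parent;
  std::vector<TrieNode *> Callees;
  ExtraDataT ExtraData;
};

// Merge two nodes with the same FuncId: combine payloads via MergeFn, recurse
// into callees present on both sides, and adopt one-sided callees as-is.
template <typename ExtraDataT, typename MergeFnT>
TrieNode<ExtraDataT> *mergeTrieNodes(const TrieNode<ExtraDataT> &Left,
                                     const TrieNode<ExtraDataT> &Right,
                                     TrieNode<ExtraDataT> *NewParent,
                                     std::forward_list<TrieNode<ExtraDataT>> &Store,
                                     MergeFnT MergeFn) {
  Store.push_front({Left.FuncId, NewParent, {},
                    MergeFn(Left.ExtraData, Right.ExtraData)});
  auto *Node = &Store.front();

  std::unordered_map<int32_t, TrieNode<ExtraDataT> *> LeftCallees;
  for (auto *C : Left.Callees)
    LeftCallees[C->FuncId] = C;

  for (auto *C : Right.Callees) {
    auto It = LeftCallees.find(C->FuncId);
    if (It != LeftCallees.end()) {
      Node->Callees.push_back(mergeTrieNodes(*It->second, *C, Node, Store, MergeFn));
      LeftCallees.erase(It);
    } else {
      Node->Callees.push_back(C);
    }
  }
  for (auto &KV : LeftCallees)
    Node->Callees.push_back(KV.second);
  return Node;
}
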
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index e2fdfe82b8c..0371cd0372f 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -391,6 +391,7 @@ int main(int argc, char **argv) {
initializeTarget(Registry);
// For codegen passes, only passes that do IR to IR transformation are
// supported.
+ initializeExpandMemCmpPassPass(Registry);
initializeScalarizeMaskedMemIntrinPass(Registry);
initializeCodeGenPreparePass(Registry);
initializeAtomicExpandPass(Registry);
diff --git a/unittests/ADT/STLExtrasTest.cpp b/unittests/ADT/STLExtrasTest.cpp
index 2e6eb6f413f..68cd9f5d2c8 100644
--- a/unittests/ADT/STLExtrasTest.cpp
+++ b/unittests/ADT/STLExtrasTest.cpp
@@ -252,12 +252,20 @@ TEST(STLExtrasTest, CountAdaptor) {
EXPECT_EQ(3, count(v, 1));
EXPECT_EQ(2, count(v, 2));
EXPECT_EQ(1, count(v, 3));
- EXPECT_EQ(1, count(v, 4));
-}
-
-TEST(STLExtrasTest, ToVector) {
- std::vector<char> v = {'a', 'b', 'c'};
- auto Enumerated = to_vector<4>(enumerate(v));
+ EXPECT_EQ(1, count(v, 4));
+}
+
+TEST(STLExtrasTest, for_each) {
+ std::vector<int> v{ 0, 1, 2, 3, 4 };
+ int count = 0;
+
+ llvm::for_each(v, [&count](int) { ++count; });
+ EXPECT_EQ(5, count);
+}
+
+TEST(STLExtrasTest, ToVector) {
+ std::vector<char> v = {'a', 'b', 'c'};
+ auto Enumerated = to_vector<4>(enumerate(v));
ASSERT_EQ(3u, Enumerated.size());
for (size_t I = 0; I < v.size(); ++I) {
EXPECT_EQ(I, Enumerated[I].index());
diff --git a/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp b/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp
index 0e881759656..550201ebdd1 100644
--- a/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp
+++ b/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp
@@ -49,66 +49,91 @@ TEST(LegalizerInfoTest, ScalarRISC) {
using namespace TargetOpcode;
LegalizerInfo L;
// Typical RISCy set of operations based on AArch64.
- L.setAction({G_ADD, LLT::scalar(8)}, LegalizerInfo::WidenScalar);
- L.setAction({G_ADD, LLT::scalar(16)}, LegalizerInfo::WidenScalar);
- L.setAction({G_ADD, LLT::scalar(32)}, LegalizerInfo::Legal);
- L.setAction({G_ADD, LLT::scalar(64)}, LegalizerInfo::Legal);
+ for (unsigned Op : {G_ADD, G_SUB}) {
+ for (unsigned Size : {32, 64})
+ L.setAction({Op, 0, LLT::scalar(Size)}, LegalizerInfo::Legal);
+ L.setLegalizeScalarToDifferentSizeStrategy(
+ Op, 0, LegalizerInfo::widenToLargerTypesAndNarrowToLargest);
+ }
+
L.computeTables();
- // Check we infer the correct types and actually do what we're told.
- ASSERT_EQ(L.getAction({G_ADD, LLT::scalar(8)}),
- std::make_pair(LegalizerInfo::WidenScalar, LLT::scalar(32)));
- ASSERT_EQ(L.getAction({G_ADD, LLT::scalar(16)}),
- std::make_pair(LegalizerInfo::WidenScalar, LLT::scalar(32)));
- ASSERT_EQ(L.getAction({G_ADD, LLT::scalar(32)}),
- std::make_pair(LegalizerInfo::Legal, LLT::scalar(32)));
- ASSERT_EQ(L.getAction({G_ADD, LLT::scalar(64)}),
- std::make_pair(LegalizerInfo::Legal, LLT::scalar(64)));
-
- // Make sure the default for over-sized types applies.
- ASSERT_EQ(L.getAction({G_ADD, LLT::scalar(128)}),
- std::make_pair(LegalizerInfo::NarrowScalar, LLT::scalar(64)));
+ for (unsigned opcode : {G_ADD, G_SUB}) {
+ // Check we infer the correct types and actually do what we're told.
+ ASSERT_EQ(L.getAction({opcode, LLT::scalar(8)}),
+ std::make_pair(LegalizerInfo::WidenScalar, LLT::scalar(32)));
+ ASSERT_EQ(L.getAction({opcode, LLT::scalar(16)}),
+ std::make_pair(LegalizerInfo::WidenScalar, LLT::scalar(32)));
+ ASSERT_EQ(L.getAction({opcode, LLT::scalar(32)}),
+ std::make_pair(LegalizerInfo::Legal, LLT::scalar(32)));
+ ASSERT_EQ(L.getAction({opcode, LLT::scalar(64)}),
+ std::make_pair(LegalizerInfo::Legal, LLT::scalar(64)));
+
+ // Make sure the default for over-sized types applies.
+ ASSERT_EQ(L.getAction({opcode, LLT::scalar(128)}),
+ std::make_pair(LegalizerInfo::NarrowScalar, LLT::scalar(64)));
+ // Make sure we also handle unusual sizes
+ ASSERT_EQ(L.getAction({opcode, LLT::scalar(1)}),
+ std::make_pair(LegalizerInfo::WidenScalar, LLT::scalar(32)));
+ ASSERT_EQ(L.getAction({opcode, LLT::scalar(31)}),
+ std::make_pair(LegalizerInfo::WidenScalar, LLT::scalar(32)));
+ ASSERT_EQ(L.getAction({opcode, LLT::scalar(33)}),
+ std::make_pair(LegalizerInfo::WidenScalar, LLT::scalar(64)));
+ ASSERT_EQ(L.getAction({opcode, LLT::scalar(63)}),
+ std::make_pair(LegalizerInfo::WidenScalar, LLT::scalar(64)));
+ ASSERT_EQ(L.getAction({opcode, LLT::scalar(65)}),
+ std::make_pair(LegalizerInfo::NarrowScalar, LLT::scalar(64)));
+ }
}
TEST(LegalizerInfoTest, VectorRISC) {
using namespace TargetOpcode;
LegalizerInfo L;
// Typical RISCy set of operations based on ARM.
- L.setScalarInVectorAction(G_ADD, LLT::scalar(8), LegalizerInfo::Legal);
- L.setScalarInVectorAction(G_ADD, LLT::scalar(16), LegalizerInfo::Legal);
- L.setScalarInVectorAction(G_ADD, LLT::scalar(32), LegalizerInfo::Legal);
-
L.setAction({G_ADD, LLT::vector(8, 8)}, LegalizerInfo::Legal);
L.setAction({G_ADD, LLT::vector(16, 8)}, LegalizerInfo::Legal);
L.setAction({G_ADD, LLT::vector(4, 16)}, LegalizerInfo::Legal);
L.setAction({G_ADD, LLT::vector(8, 16)}, LegalizerInfo::Legal);
L.setAction({G_ADD, LLT::vector(2, 32)}, LegalizerInfo::Legal);
L.setAction({G_ADD, LLT::vector(4, 32)}, LegalizerInfo::Legal);
+
+ L.setLegalizeVectorElementToDifferentSizeStrategy(
+ G_ADD, 0, LegalizerInfo::widenToLargerTypesUnsupportedOtherwise);
+
+ L.setAction({G_ADD, 0, LLT::scalar(32)}, LegalizerInfo::Legal);
+
L.computeTables();
// Check we infer the correct types and actually do what we're told for some
// simple cases.
- ASSERT_EQ(L.getAction({G_ADD, LLT::vector(2, 8)}),
- std::make_pair(LegalizerInfo::MoreElements, LLT::vector(8, 8)));
ASSERT_EQ(L.getAction({G_ADD, LLT::vector(8, 8)}),
std::make_pair(LegalizerInfo::Legal, LLT::vector(8, 8)));
- ASSERT_EQ(
- L.getAction({G_ADD, LLT::vector(8, 32)}),
- std::make_pair(LegalizerInfo::FewerElements, LLT::vector(4, 32)));
+ ASSERT_EQ(L.getAction({G_ADD, LLT::vector(8, 7)}),
+ std::make_pair(LegalizerInfo::WidenScalar, LLT::vector(8, 8)));
+ ASSERT_EQ(L.getAction({G_ADD, LLT::vector(2, 8)}),
+ std::make_pair(LegalizerInfo::MoreElements, LLT::vector(8, 8)));
+ ASSERT_EQ(L.getAction({G_ADD, LLT::vector(8, 32)}),
+ std::make_pair(LegalizerInfo::FewerElements, LLT::vector(4, 32)));
+ // Check a few non-power-of-2 sizes:
+ ASSERT_EQ(L.getAction({G_ADD, LLT::vector(3, 3)}),
+ std::make_pair(LegalizerInfo::WidenScalar, LLT::vector(3, 8)));
+ ASSERT_EQ(L.getAction({G_ADD, LLT::vector(3, 8)}),
+ std::make_pair(LegalizerInfo::MoreElements, LLT::vector(8, 8)));
}
TEST(LegalizerInfoTest, MultipleTypes) {
using namespace TargetOpcode;
LegalizerInfo L;
LLT p0 = LLT::pointer(0, 64);
- LLT s32 = LLT::scalar(32);
LLT s64 = LLT::scalar(64);
// Typical RISCy set of operations based on AArch64.
L.setAction({G_PTRTOINT, 0, s64}, LegalizerInfo::Legal);
L.setAction({G_PTRTOINT, 1, p0}, LegalizerInfo::Legal);
- L.setAction({G_PTRTOINT, 0, s32}, LegalizerInfo::WidenScalar);
+ L.setLegalizeScalarToDifferentSizeStrategy(
+ G_PTRTOINT, 0, LegalizerInfo::widenToLargerTypesAndNarrowToLargest);
+
L.computeTables();
// Check we infer the correct types and actually do what we're told.
@@ -116,16 +141,21 @@ TEST(LegalizerInfoTest, MultipleTypes) {
std::make_pair(LegalizerInfo::Legal, s64));
ASSERT_EQ(L.getAction({G_PTRTOINT, 1, p0}),
std::make_pair(LegalizerInfo::Legal, p0));
+ // Make sure we also handle unusual sizes
+ ASSERT_EQ(L.getAction({G_PTRTOINT, 0, LLT::scalar(65)}),
+ std::make_pair(LegalizerInfo::NarrowScalar, s64));
+ ASSERT_EQ(L.getAction({G_PTRTOINT, 1, LLT::pointer(0, 32)}),
+ std::make_pair(LegalizerInfo::Unsupported, LLT::pointer(0, 32)));
}
TEST(LegalizerInfoTest, MultipleSteps) {
using namespace TargetOpcode;
LegalizerInfo L;
- LLT s16 = LLT::scalar(16);
LLT s32 = LLT::scalar(32);
LLT s64 = LLT::scalar(64);
- L.setAction({G_UREM, 0, s16}, LegalizerInfo::WidenScalar);
+ L.setLegalizeScalarToDifferentSizeStrategy(
+ G_UREM, 0, LegalizerInfo::widenToLargerTypesUnsupportedOtherwise);
L.setAction({G_UREM, 0, s32}, LegalizerInfo::Lower);
L.setAction({G_UREM, 0, s64}, LegalizerInfo::Lower);
@@ -136,4 +166,33 @@ TEST(LegalizerInfoTest, MultipleSteps) {
ASSERT_EQ(L.getAction({G_UREM, LLT::scalar(32)}),
std::make_pair(LegalizerInfo::Lower, LLT::scalar(32)));
}
+
+TEST(LegalizerInfoTest, SizeChangeStrategy) {
+ using namespace TargetOpcode;
+ LegalizerInfo L;
+ for (unsigned Size : {1, 8, 16, 32})
+ L.setAction({G_UREM, 0, LLT::scalar(Size)}, LegalizerInfo::Legal);
+
+ L.setLegalizeScalarToDifferentSizeStrategy(
+ G_UREM, 0, LegalizerInfo::widenToLargerTypesUnsupportedOtherwise);
+ L.computeTables();
+
+ // Check we infer the correct types and actually do what we're told.
+ for (unsigned Size : {1, 8, 16, 32}) {
+ ASSERT_EQ(L.getAction({G_UREM, LLT::scalar(Size)}),
+ std::make_pair(LegalizerInfo::Legal, LLT::scalar(Size)));
+ }
+ ASSERT_EQ(L.getAction({G_UREM, LLT::scalar(2)}),
+ std::make_pair(LegalizerInfo::WidenScalar, LLT::scalar(8)));
+ ASSERT_EQ(L.getAction({G_UREM, LLT::scalar(7)}),
+ std::make_pair(LegalizerInfo::WidenScalar, LLT::scalar(8)));
+ ASSERT_EQ(L.getAction({G_UREM, LLT::scalar(9)}),
+ std::make_pair(LegalizerInfo::WidenScalar, LLT::scalar(16)));
+ ASSERT_EQ(L.getAction({G_UREM, LLT::scalar(17)}),
+ std::make_pair(LegalizerInfo::WidenScalar, LLT::scalar(32)));
+ ASSERT_EQ(L.getAction({G_UREM, LLT::scalar(31)}),
+ std::make_pair(LegalizerInfo::WidenScalar, LLT::scalar(32)));
+ ASSERT_EQ(L.getAction({G_UREM, LLT::scalar(33)}),
+ std::make_pair(LegalizerInfo::Unsupported, LLT::scalar(33)));
+}
}
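
The new size-change-strategy hooks replace per-size setAction calls: given the scalar sizes explicitly marked Legal, the strategy decides what to do with every other bit width, and the assertions above pin down the widenToLargerTypesAndNarrowToLargest behavior. A standalone sketch of that decision for scalars (illustrative code, not the LegalizerInfo implementation):

#include <vector>

enum class Action { Legal, WidenScalar, NarrowScalar };
struct Decision {
  Action Act;
  unsigned Size;
};

// LegalSizes holds the bit widths marked Legal, sorted ascending and
// non-empty (e.g. {32, 64} for the G_ADD/G_SUB setup above).
Decision chooseScalarAction(unsigned Requested,
                            const std::vector<unsigned> &LegalSizes) {
  for (unsigned S : LegalSizes) {
    if (S == Requested)
      return {Action::Legal, Requested};
    if (S > Requested)
      return {Action::WidenScalar, S}; // widen to the next larger legal size
  }
  // Nothing legal is wide enough: narrow to the largest legal size.
  return {Action::NarrowScalar, LegalSizes.back()};
}
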
diff --git a/unittests/CodeGen/LowLevelTypeTest.cpp b/unittests/CodeGen/LowLevelTypeTest.cpp
index 11555464290..a4765d99856 100644
--- a/unittests/CodeGen/LowLevelTypeTest.cpp
+++ b/unittests/CodeGen/LowLevelTypeTest.cpp
@@ -36,36 +36,22 @@ TEST(LowLevelTypeTest, Scalar) {
for (unsigned S : {1U, 17U, 32U, 64U, 0xfffffU}) {
const LLT Ty = LLT::scalar(S);
- const LLT HalfTy = (S % 2) == 0 ? Ty.halfScalarSize() : Ty;
- const LLT DoubleTy = Ty.doubleScalarSize();
// Test kind.
- for (const LLT TestTy : {Ty, HalfTy, DoubleTy}) {
- ASSERT_TRUE(TestTy.isValid());
- ASSERT_TRUE(TestTy.isScalar());
+ ASSERT_TRUE(Ty.isValid());
+ ASSERT_TRUE(Ty.isScalar());
- ASSERT_FALSE(TestTy.isPointer());
- ASSERT_FALSE(TestTy.isVector());
- }
+ ASSERT_FALSE(Ty.isPointer());
+ ASSERT_FALSE(Ty.isVector());
// Test sizes.
EXPECT_EQ(S, Ty.getSizeInBits());
EXPECT_EQ(S, Ty.getScalarSizeInBits());
- EXPECT_EQ(S*2, DoubleTy.getSizeInBits());
- EXPECT_EQ(S*2, DoubleTy.getScalarSizeInBits());
-
- if ((S % 2) == 0) {
- EXPECT_EQ(S/2, HalfTy.getSizeInBits());
- EXPECT_EQ(S/2, HalfTy.getScalarSizeInBits());
- }
-
// Test equality operators.
EXPECT_TRUE(Ty == Ty);
EXPECT_FALSE(Ty != Ty);
- EXPECT_NE(Ty, DoubleTy);
-
// Test Type->LLT conversion.
Type *IRTy = IntegerType::get(C, S);
EXPECT_EQ(Ty, getLLTForType(*IRTy, DL));
@@ -90,62 +76,18 @@ TEST(LowLevelTypeTest, Vector) {
// Test getElementType().
EXPECT_EQ(STy, VTy.getElementType());
- const LLT HalfSzTy = ((S % 2) == 0) ? VTy.halfScalarSize() : VTy;
- const LLT DoubleSzTy = VTy.doubleScalarSize();
-
- // halfElements requires an even number of elements.
- const LLT HalfEltIfEvenTy = ((Elts % 2) == 0) ? VTy.halfElements() : VTy;
- const LLT DoubleEltTy = VTy.doubleElements();
-
// Test kind.
- for (const LLT TestTy : {VTy, HalfSzTy, DoubleSzTy, DoubleEltTy}) {
- ASSERT_TRUE(TestTy.isValid());
- ASSERT_TRUE(TestTy.isVector());
-
- ASSERT_FALSE(TestTy.isScalar());
- ASSERT_FALSE(TestTy.isPointer());
- }
-
- // Test halving elements to a scalar.
- {
- ASSERT_TRUE(HalfEltIfEvenTy.isValid());
- ASSERT_FALSE(HalfEltIfEvenTy.isPointer());
- if (Elts > 2) {
- ASSERT_TRUE(HalfEltIfEvenTy.isVector());
- } else {
- ASSERT_FALSE(HalfEltIfEvenTy.isVector());
- EXPECT_EQ(STy, HalfEltIfEvenTy);
- }
- }
+ ASSERT_TRUE(VTy.isValid());
+ ASSERT_TRUE(VTy.isVector());
+ ASSERT_FALSE(VTy.isScalar());
+ ASSERT_FALSE(VTy.isPointer());
// Test sizes.
EXPECT_EQ(S * Elts, VTy.getSizeInBits());
EXPECT_EQ(S, VTy.getScalarSizeInBits());
EXPECT_EQ(Elts, VTy.getNumElements());
- if ((S % 2) == 0) {
- EXPECT_EQ((S / 2) * Elts, HalfSzTy.getSizeInBits());
- EXPECT_EQ(S / 2, HalfSzTy.getScalarSizeInBits());
- EXPECT_EQ(Elts, HalfSzTy.getNumElements());
- }
-
- EXPECT_EQ((S * 2) * Elts, DoubleSzTy.getSizeInBits());
- EXPECT_EQ(S * 2, DoubleSzTy.getScalarSizeInBits());
- EXPECT_EQ(Elts, DoubleSzTy.getNumElements());
-
- if ((Elts % 2) == 0) {
- EXPECT_EQ(S * (Elts / 2), HalfEltIfEvenTy.getSizeInBits());
- EXPECT_EQ(S, HalfEltIfEvenTy.getScalarSizeInBits());
- if (Elts > 2) {
- EXPECT_EQ(Elts / 2, HalfEltIfEvenTy.getNumElements());
- }
- }
-
- EXPECT_EQ(S * (Elts * 2), DoubleEltTy.getSizeInBits());
- EXPECT_EQ(S, DoubleEltTy.getScalarSizeInBits());
- EXPECT_EQ(Elts * 2, DoubleEltTy.getNumElements());
-
// Test equality operators.
EXPECT_TRUE(VTy == VTy);
EXPECT_FALSE(VTy != VTy);
@@ -153,10 +95,6 @@ TEST(LowLevelTypeTest, Vector) {
// Test inequality operators on..
// ..different kind.
EXPECT_NE(VTy, STy);
- // ..different #elts.
- EXPECT_NE(VTy, DoubleEltTy);
- // ..different scalar size.
- EXPECT_NE(VTy, DoubleSzTy);
// Test Type->LLT conversion.
Type *IRSTy = IntegerType::get(C, S);
diff --git a/unittests/CodeGen/MachineInstrTest.cpp b/unittests/CodeGen/MachineInstrTest.cpp
index 89041e2ab22..808890e175d 100644
--- a/unittests/CodeGen/MachineInstrTest.cpp
+++ b/unittests/CodeGen/MachineInstrTest.cpp
@@ -10,10 +10,10 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/unittests/DebugInfo/DWARF/DWARFFormValueTest.cpp b/unittests/DebugInfo/DWARF/DWARFFormValueTest.cpp
index f283ac0961c..14550b9082b 100644
--- a/unittests/DebugInfo/DWARF/DWARFFormValueTest.cpp
+++ b/unittests/DebugInfo/DWARF/DWARFFormValueTest.cpp
@@ -99,7 +99,7 @@ DWARFFormValue createDataXFormValue(dwarf::Form Form, RawTypeT Value) {
DWARFFormValue Result(Form);
DWARFDataExtractor Data(StringRef(Raw, sizeof(RawTypeT)),
sys::IsLittleEndianHost, sizeof(void *));
- Result.extractValue(Data, &Offset, nullptr);
+ Result.extractValue(Data, &Offset, {0, 0, dwarf::DwarfFormat::DWARF32});
return Result;
}
@@ -110,7 +110,7 @@ DWARFFormValue createULEBFormValue(uint64_t Value) {
uint32_t Offset = 0;
DWARFFormValue Result(DW_FORM_udata);
DWARFDataExtractor Data(OS.str(), sys::IsLittleEndianHost, sizeof(void *));
- Result.extractValue(Data, &Offset, nullptr);
+ Result.extractValue(Data, &Offset, {0, 0, dwarf::DwarfFormat::DWARF32});
return Result;
}
@@ -121,7 +121,7 @@ DWARFFormValue createSLEBFormValue(int64_t Value) {
uint32_t Offset = 0;
DWARFFormValue Result(DW_FORM_sdata);
DWARFDataExtractor Data(OS.str(), sys::IsLittleEndianHost, sizeof(void *));
- Result.extractValue(Data, &Offset, nullptr);
+ Result.extractValue(Data, &Offset, {0, 0, dwarf::DwarfFormat::DWARF32});
return Result;
}
diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp
index d361107cc0d..bb74756d81a 100644
--- a/unittests/IR/IRBuilderTest.cpp
+++ b/unittests/IR/IRBuilderTest.cpp
@@ -144,17 +144,40 @@ TEST_F(IRBuilderTest, FastMathFlags) {
FastMathFlags FMF;
Builder.setFastMathFlags(FMF);
+ // By default, no flags are set.
F = Builder.CreateFAdd(F, F);
EXPECT_FALSE(Builder.getFastMathFlags().any());
+ ASSERT_TRUE(isa<Instruction>(F));
+ FAdd = cast<Instruction>(F);
+ EXPECT_FALSE(FAdd->hasNoNaNs());
+ EXPECT_FALSE(FAdd->hasNoInfs());
+ EXPECT_FALSE(FAdd->hasNoSignedZeros());
+ EXPECT_FALSE(FAdd->hasAllowReciprocal());
+ EXPECT_FALSE(FAdd->hasAllowContract());
+ EXPECT_FALSE(FAdd->hasAllowReassoc());
+ EXPECT_FALSE(FAdd->hasApproxFunc());
- FMF.setUnsafeAlgebra();
+ // Set all flags in the instruction.
+ FAdd->setFast(true);
+ EXPECT_TRUE(FAdd->hasNoNaNs());
+ EXPECT_TRUE(FAdd->hasNoInfs());
+ EXPECT_TRUE(FAdd->hasNoSignedZeros());
+ EXPECT_TRUE(FAdd->hasAllowReciprocal());
+ EXPECT_TRUE(FAdd->hasAllowContract());
+ EXPECT_TRUE(FAdd->hasAllowReassoc());
+ EXPECT_TRUE(FAdd->hasApproxFunc());
+
+ // All flags are set in the builder.
+ FMF.setFast();
Builder.setFastMathFlags(FMF);
F = Builder.CreateFAdd(F, F);
EXPECT_TRUE(Builder.getFastMathFlags().any());
+ EXPECT_TRUE(Builder.getFastMathFlags().all());
ASSERT_TRUE(isa<Instruction>(F));
FAdd = cast<Instruction>(F);
EXPECT_TRUE(FAdd->hasNoNaNs());
+ EXPECT_TRUE(FAdd->isFast());
// Now, try it with CreateBinOp
F = Builder.CreateBinOp(Instruction::FAdd, F, F);
@@ -162,21 +185,23 @@ TEST_F(IRBuilderTest, FastMathFlags) {
ASSERT_TRUE(isa<Instruction>(F));
FAdd = cast<Instruction>(F);
EXPECT_TRUE(FAdd->hasNoNaNs());
+ EXPECT_TRUE(FAdd->isFast());
F = Builder.CreateFDiv(F, F);
- EXPECT_TRUE(Builder.getFastMathFlags().any());
- EXPECT_TRUE(Builder.getFastMathFlags().UnsafeAlgebra);
+ EXPECT_TRUE(Builder.getFastMathFlags().all());
ASSERT_TRUE(isa<Instruction>(F));
FDiv = cast<Instruction>(F);
EXPECT_TRUE(FDiv->hasAllowReciprocal());
+ // Clear all FMF in the builder.
Builder.clearFastMathFlags();
F = Builder.CreateFDiv(F, F);
ASSERT_TRUE(isa<Instruction>(F));
FDiv = cast<Instruction>(F);
EXPECT_FALSE(FDiv->hasAllowReciprocal());
-
+
+ // Try individual flags.
FMF.clear();
FMF.setAllowReciprocal();
Builder.setFastMathFlags(FMF);
@@ -225,7 +250,25 @@ TEST_F(IRBuilderTest, FastMathFlags) {
FAdd = cast<Instruction>(FC);
EXPECT_TRUE(FAdd->hasAllowContract());
+ FMF.setApproxFunc();
+ Builder.clearFastMathFlags();
+ Builder.setFastMathFlags(FMF);
+  // Now 'afn' and 'contract' are set.
+ F = Builder.CreateFMul(F, F);
+ FAdd = cast<Instruction>(F);
+ EXPECT_TRUE(FAdd->hasApproxFunc());
+ EXPECT_TRUE(FAdd->hasAllowContract());
+ EXPECT_FALSE(FAdd->hasAllowReassoc());
+
+ FMF.setAllowReassoc();
Builder.clearFastMathFlags();
+ Builder.setFastMathFlags(FMF);
+  // Now 'afn', 'contract', and 'reassoc' are set.
+ F = Builder.CreateFMul(F, F);
+ FAdd = cast<Instruction>(F);
+ EXPECT_TRUE(FAdd->hasApproxFunc());
+ EXPECT_TRUE(FAdd->hasAllowContract());
+ EXPECT_TRUE(FAdd->hasAllowReassoc());
// Test a call with FMF.
auto CalleeTy = FunctionType::get(Type::getFloatTy(Ctx),
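
The checks above exercise the reworked fast-math flag set, where the 'fast' umbrella implies every individual flag. A small hedged sketch of the same property directly on FastMathFlags (accessor names assumed from the IR headers of this vintage):

#include "llvm/IR/Operator.h"

// setFast() is the umbrella the test exercises through IRBuilder and
// Instruction::setFast(); each individual query should then be true.
bool fastImpliesAllFlags() {
  llvm::FastMathFlags FMF;
  FMF.setFast();
  return FMF.noNaNs() && FMF.noInfs() && FMF.noSignedZeros() &&
         FMF.allowReciprocal() && FMF.allowContract() && FMF.allowReassoc() &&
         FMF.approxFunc() && FMF.isFast();
}
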
diff --git a/unittests/Support/DynamicLibrary/CMakeLists.txt b/unittests/Support/DynamicLibrary/CMakeLists.txt
index b5844381362..9355979221a 100644
--- a/unittests/Support/DynamicLibrary/CMakeLists.txt
+++ b/unittests/Support/DynamicLibrary/CMakeLists.txt
@@ -1,6 +1,7 @@
set(LLVM_LINK_COMPONENTS Support)
add_library(DynamicLibraryLib STATIC ExportedFuncs.cxx)
+set_target_properties(DynamicLibraryLib PROPERTIES FOLDER "Tests")
add_llvm_unittest(DynamicLibraryTests DynamicLibraryTest.cpp)
target_link_libraries(DynamicLibraryTests DynamicLibraryLib)
@@ -8,6 +9,7 @@ export_executable_symbols(DynamicLibraryTests)
function(dynlib_add_module NAME)
add_library(${NAME} SHARED PipSqueak.cxx)
+ set_target_properties(${NAME} PROPERTIES FOLDER "Tests")
set_output_directory(${NAME}
BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}
diff --git a/unittests/Support/FileOutputBufferTest.cpp b/unittests/Support/FileOutputBufferTest.cpp
index 640af46844b..e7f1fd765bd 100644
--- a/unittests/Support/FileOutputBufferTest.cpp
+++ b/unittests/Support/FileOutputBufferTest.cpp
@@ -42,16 +42,16 @@ TEST(FileOutputBuffer, Test) {
SmallString<128> File1(TestDirectory);
File1.append("/file1");
{
- ErrorOr<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
+ Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
FileOutputBuffer::create(File1, 8192);
- ASSERT_NO_ERROR(BufferOrErr.getError());
+ ASSERT_NO_ERROR(errorToErrorCode(BufferOrErr.takeError()));
std::unique_ptr<FileOutputBuffer> &Buffer = *BufferOrErr;
// Start buffer with special header.
memcpy(Buffer->getBufferStart(), "AABBCCDDEEFFGGHHIIJJ", 20);
// Write to end of buffer to verify it is writable.
memcpy(Buffer->getBufferEnd() - 20, "AABBCCDDEEFFGGHHIIJJ", 20);
// Commit buffer.
- ASSERT_NO_ERROR(Buffer->commit());
+ ASSERT_NO_ERROR(errorToErrorCode(Buffer->commit()));
}
// Verify file is correct size.
@@ -64,9 +64,9 @@ TEST(FileOutputBuffer, Test) {
SmallString<128> File2(TestDirectory);
File2.append("/file2");
{
- ErrorOr<std::unique_ptr<FileOutputBuffer>> Buffer2OrErr =
+ Expected<std::unique_ptr<FileOutputBuffer>> Buffer2OrErr =
FileOutputBuffer::create(File2, 8192);
- ASSERT_NO_ERROR(Buffer2OrErr.getError());
+ ASSERT_NO_ERROR(errorToErrorCode(Buffer2OrErr.takeError()));
std::unique_ptr<FileOutputBuffer> &Buffer2 = *Buffer2OrErr;
// Fill buffer with special header.
memcpy(Buffer2->getBufferStart(), "AABBCCDDEEFFGGHHIIJJ", 20);
@@ -81,15 +81,15 @@ TEST(FileOutputBuffer, Test) {
SmallString<128> File3(TestDirectory);
File3.append("/file3");
{
- ErrorOr<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
+ Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
FileOutputBuffer::create(File3, 8192000);
- ASSERT_NO_ERROR(BufferOrErr.getError());
+ ASSERT_NO_ERROR(errorToErrorCode(BufferOrErr.takeError()));
std::unique_ptr<FileOutputBuffer> &Buffer = *BufferOrErr;
// Start buffer with special header.
memcpy(Buffer->getBufferStart(), "AABBCCDDEEFFGGHHIIJJ", 20);
// Write to end of buffer to verify it is writable.
memcpy(Buffer->getBufferEnd() - 20, "AABBCCDDEEFFGGHHIIJJ", 20);
- ASSERT_NO_ERROR(Buffer->commit());
+ ASSERT_NO_ERROR(errorToErrorCode(Buffer->commit()));
}
// Verify file is correct size.
@@ -102,14 +102,14 @@ TEST(FileOutputBuffer, Test) {
SmallString<128> File4(TestDirectory);
File4.append("/file4");
{
- ErrorOr<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
+ Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
FileOutputBuffer::create(File4, 8192, FileOutputBuffer::F_executable);
- ASSERT_NO_ERROR(BufferOrErr.getError());
+ ASSERT_NO_ERROR(errorToErrorCode(BufferOrErr.takeError()));
std::unique_ptr<FileOutputBuffer> &Buffer = *BufferOrErr;
// Start buffer with special header.
memcpy(Buffer->getBufferStart(), "AABBCCDDEEFFGGHHIIJJ", 20);
// Commit buffer.
- ASSERT_NO_ERROR(Buffer->commit());
+ ASSERT_NO_ERROR(errorToErrorCode(Buffer->commit()));
}
// Verify file exists and is executable.
fs::file_status Status;
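
FileOutputBuffer::create now returns Expected<std::unique_ptr<FileOutputBuffer>> rather than ErrorOr, so failures carry an llvm::Error that must be consumed; the test routes it through errorToErrorCode for ASSERT_NO_ERROR. A small hedged sketch of the same consumption pattern (hypothetical parseWidth producer, real Error utilities):

#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
#include <system_error>
using namespace llvm;

// Hypothetical producer returning Expected<T>.
Expected<int> parseWidth(bool Ok) {
  if (!Ok)
    return make_error<StringError>("bad width", inconvertibleErrorCode());
  return 42;
}

int demo() {
  Expected<int> W = parseWidth(true);
  if (!W) {
    // The Error must be consumed; converting to std::error_code mirrors the
    // test's ASSERT_NO_ERROR(errorToErrorCode(...takeError())) idiom.
    std::error_code EC = errorToErrorCode(W.takeError());
    errs() << "failed: " << EC.message() << "\n";
    return 1;
  }
  return *W;
}
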
diff --git a/unittests/Support/SpecialCaseListTest.cpp b/unittests/Support/SpecialCaseListTest.cpp
index 9e1223b5497..060703e102f 100644
--- a/unittests/Support/SpecialCaseListTest.cpp
+++ b/unittests/Support/SpecialCaseListTest.cpp
@@ -58,6 +58,30 @@ TEST_F(SpecialCaseListTest, Basic) {
EXPECT_FALSE(SCL->inSection("", "src", "hi"));
EXPECT_FALSE(SCL->inSection("", "fun", "hello"));
EXPECT_FALSE(SCL->inSection("", "src", "hello", "category"));
+
+ EXPECT_EQ(3u, SCL->inSectionBlame("", "src", "hello"));
+ EXPECT_EQ(4u, SCL->inSectionBlame("", "src", "bye"));
+ EXPECT_EQ(5u, SCL->inSectionBlame("", "src", "hi", "category"));
+ EXPECT_EQ(6u, SCL->inSectionBlame("", "src", "zzzz", "category"));
+ EXPECT_EQ(0u, SCL->inSectionBlame("", "src", "hi"));
+ EXPECT_EQ(0u, SCL->inSectionBlame("", "fun", "hello"));
+ EXPECT_EQ(0u, SCL->inSectionBlame("", "src", "hello", "category"));
+}
+
+TEST_F(SpecialCaseListTest, CorrectErrorLineNumberWithBlankLine) {
+ std::string Error;
+ EXPECT_EQ(nullptr, makeSpecialCaseList("# This is a comment.\n"
+ "\n"
+ "[not valid\n",
+ Error));
+ EXPECT_TRUE(
+ ((StringRef)Error).startswith("malformed section header on line 3:"));
+
+ EXPECT_EQ(nullptr, makeSpecialCaseList("\n\n\n"
+ "[not valid\n",
+ Error));
+ EXPECT_TRUE(
+ ((StringRef)Error).startswith("malformed section header on line 4:"));
}
TEST_F(SpecialCaseListTest, SectionRegexErrorHandling) {
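
The new inSectionBlame queries return the 1-based line number of the matching list entry, or 0 when nothing matches, which is why each assertion above pairs a query with the line it expects to hit. As a hedged illustration (hypothetical list, not the fixture's actual contents): if line 3 of the list read `src:hello`, then inSectionBlame("", "src", "hello") would return 3, while an unmatched query returns 0.
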
diff --git a/unittests/tools/llvm-cfi-verify/CMakeLists.txt b/unittests/tools/llvm-cfi-verify/CMakeLists.txt
index ad3266c2777..adb7a55327a 100644
--- a/unittests/tools/llvm-cfi-verify/CMakeLists.txt
+++ b/unittests/tools/llvm-cfi-verify/CMakeLists.txt
@@ -8,6 +8,7 @@ set(LLVM_LINK_COMPONENTS
MCParser
Object
Support
+ Symbolize
)
add_llvm_unittest(CFIVerifyTests
diff --git a/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp b/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp
index 0df468e8995..00346ab5a14 100644
--- a/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp
+++ b/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp
@@ -64,6 +64,7 @@ public:
class BasicFileAnalysisTest : public ::testing::Test {
protected:
virtual void SetUp() {
+ IgnoreDWARFFlag = true;
SuccessfullyInitialised = true;
if (auto Err = Analysis.initialiseDisassemblyMembers()) {
handleAllErrors(std::move(Err), [&](const UnsupportedDisassembly &E) {
@@ -650,7 +651,60 @@ TEST_F(BasicFileAnalysisTest, CFIProtectionComplexExample) {
0x0f, 0x0b, // 22: ud2
},
0xDEADBEEF);
+ uint64_t PrevSearchLengthForUndef = SearchLengthForUndef;
+ SearchLengthForUndef = 5;
EXPECT_FALSE(Analysis.isIndirectInstructionCFIProtected(0xDEADBEEF + 9));
+ SearchLengthForUndef = PrevSearchLengthForUndef;
+}
+
+TEST_F(BasicFileAnalysisTest, UndefSearchLengthOneTest) {
+ Analysis.parseSectionContents(
+ {
+ 0x77, 0x0d, // 0x688118: ja 0x688127 [+12]
+ 0x48, 0x89, 0xdf, // 0x68811a: mov %rbx, %rdi
+ 0xff, 0xd0, // 0x68811d: callq *%rax
+ 0x48, 0x89, 0xdf, // 0x68811f: mov %rbx, %rdi
+ 0xe8, 0x09, 0x00, 0x00, 0x00, // 0x688122: callq 0x688130
+ 0x0f, 0x0b, // 0x688127: ud2
+ },
+ 0x688118);
+ uint64_t PrevSearchLengthForUndef = SearchLengthForUndef;
+ SearchLengthForUndef = 1;
+ EXPECT_TRUE(Analysis.isIndirectInstructionCFIProtected(0x68811d));
+ SearchLengthForUndef = PrevSearchLengthForUndef;
+}
+
+TEST_F(BasicFileAnalysisTest, UndefSearchLengthOneTestFarAway) {
+ Analysis.parseSectionContents(
+ {
+ 0x74, 0x73, // 0x7759eb: je 0x775a60
+ 0xe9, 0x1c, 0x04, 0x00, 0x00, 0x00, // 0x7759ed: jmpq 0x775e0e
+ },
+ 0x7759eb);
+
+ Analysis.parseSectionContents(
+ {
+ 0x0f, 0x85, 0xb2, 0x03, 0x00, 0x00, // 0x775a56: jne 0x775e0e
+ 0x48, 0x83, 0xc3, 0xf4, // 0x775a5c: add $0xfffffffffffffff4,%rbx
+ 0x48, 0x8b, 0x7c, 0x24, 0x10, // 0x775a60: mov 0x10(%rsp),%rdi
+ 0x48, 0x89, 0xde, // 0x775a65: mov %rbx,%rsi
+ 0xff, 0xd1, // 0x775a68: callq *%rcx
+ },
+ 0x775a56);
+
+ Analysis.parseSectionContents(
+ {
+ 0x0f, 0x0b, // 0x775e0e: ud2
+ },
+ 0x775e0e);
+ uint64_t PrevSearchLengthForUndef = SearchLengthForUndef;
+ SearchLengthForUndef = 1;
+ EXPECT_FALSE(Analysis.isIndirectInstructionCFIProtected(0x775a68));
+ SearchLengthForUndef = 2;
+ EXPECT_TRUE(Analysis.isIndirectInstructionCFIProtected(0x775a68));
+ SearchLengthForUndef = 3;
+ EXPECT_TRUE(Analysis.isIndirectInstructionCFIProtected(0x775a68));
+ SearchLengthForUndef = PrevSearchLengthForUndef;
}
} // anonymous namespace
diff --git a/unittests/tools/llvm-cfi-verify/GraphBuilder.cpp b/unittests/tools/llvm-cfi-verify/GraphBuilder.cpp
index b200677dd09..a7d09b54781 100644
--- a/unittests/tools/llvm-cfi-verify/GraphBuilder.cpp
+++ b/unittests/tools/llvm-cfi-verify/GraphBuilder.cpp
@@ -126,6 +126,7 @@ public:
class BasicGraphBuilderTest : public ::testing::Test {
protected:
virtual void SetUp() {
+ IgnoreDWARFFlag = true;
SuccessfullyInitialised = true;
if (auto Err = Analysis.initialiseDisassemblyMembers()) {
handleAllErrors(std::move(Err), [&](const UnsupportedDisassembly &E) {
diff --git a/utils/FileCheck/FileCheck.cpp b/utils/FileCheck/FileCheck.cpp
index 9d808cc875c..7db97301637 100644
--- a/utils/FileCheck/FileCheck.cpp
+++ b/utils/FileCheck/FileCheck.cpp
@@ -62,6 +62,10 @@ static cl::list<std::string> ImplicitCheckNot(
"this pattern occur which are not matched by a positive pattern"),
cl::value_desc("pattern"));
+static cl::list<std::string> GlobalDefines("D", cl::Prefix,
+ cl::desc("Define a variable to be used in capture patterns."),
+ cl::value_desc("VAR=VALUE"));
+
static cl::opt<bool> AllowEmptyInput(
"allow-empty", cl::init(false),
cl::desc("Allow the input file to be empty. This is useful when making\n"
@@ -1295,6 +1299,9 @@ bool CheckInput(SourceMgr &SM, StringRef Buffer,
/// VariableTable - This holds all the current filecheck variables.
StringMap<StringRef> VariableTable;
+ for (const auto& Def : GlobalDefines)
+ VariableTable.insert(StringRef(Def).split('='));
+
unsigned i = 0, j = 0, e = CheckStrings.size();
while (true) {
StringRef CheckRegion;
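
The new -D option seeds the variable table before any CHECK line runs, so a command-line definition behaves like an already-captured variable. As a hedged usage illustration: invoking `FileCheck -DVALUE=10 %s < out.txt` lets a pattern such as `CHECK: result = [[VALUE]]` match the literal text supplied on the command line.
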
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index 86ff203654d..e9e6dff086a 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -43,3 +43,4 @@ add_tablegen(llvm-tblgen LLVM
X86RecognizableInstr.cpp
CTagsEmitter.cpp
)
+set_target_properties(llvm-tblgen PROPERTIES FOLDER "Tablegenning")
diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp
index f6be8da02c3..3b400c1262e 100644
--- a/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -603,6 +603,11 @@ bool TypeInfer::EnforceVectorSubVectorTypeIs(TypeSetByHwMode &Vec,
auto IsSubVec = [](MVT B, MVT P) -> bool {
if (!B.isVector() || !P.isVector())
return false;
+ // Logically a <4 x i32> is a valid subvector of <n x 4 x i32>
+ // but until there are obvious use-cases for this, keep the
+ // types separate.
+ if (B.isScalableVector() != P.isScalableVector())
+ return false;
if (B.getVectorElementType() != P.getVectorElementType())
return false;
return B.getVectorNumElements() < P.getVectorNumElements();
diff --git a/utils/TableGen/DAGISelMatcherEmitter.cpp b/utils/TableGen/DAGISelMatcherEmitter.cpp
index 76370cdad67..672f9f8620f 100644
--- a/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -974,7 +974,7 @@ void llvm::EmitMatcherTable(const Matcher *TheMatcher,
OS << " #undef TARGET_VAL\n";
OS << " SelectCodeCommon(N, MatcherTable,sizeof(MatcherTable));\n";
- OS << "}\n";
+ OS << "}\n\n";
// Next up, emit the function for node and pattern predicates:
MatcherEmitter.EmitPredicateFunctions(OS);
diff --git a/utils/TableGen/GlobalISelEmitter.cpp b/utils/TableGen/GlobalISelEmitter.cpp
index fed8ae5a80b..08649d7f9b5 100644
--- a/utils/TableGen/GlobalISelEmitter.cpp
+++ b/utils/TableGen/GlobalISelEmitter.cpp
@@ -2629,6 +2629,9 @@ Error GlobalISelEmitter::importChildMatcher(RuleMatcher &Rule,
return Error::success();
}
+ if (SrcChild->hasAnyPredicate())
+ return failedImport("Src pattern child has unsupported predicate");
+
// Check for constant immediates.
if (auto *ChildInt = dyn_cast<IntInit>(SrcChild->getLeafValue())) {
OM.addPredicate<ConstantIntOperandMatcher>(ChildInt->getValue());
diff --git a/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp b/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp
index 848e59c0790..05f30facd54 100644
--- a/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp
+++ b/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp
@@ -155,6 +155,14 @@ void X86EVEX2VEXTablesEmitter::printTable(const std::vector<Entry> &Table,
{"VALIGNQZ128rri", "VPALIGNRrri", true},
{"VALIGNDZ128rmi", "VPALIGNRrmi", true},
{"VALIGNQZ128rmi", "VPALIGNRrmi", true},
+ {"VSHUFF32X4Z256rmi", "VPERM2F128rm", false},
+ {"VSHUFF32X4Z256rri", "VPERM2F128rr", false},
+ {"VSHUFF64X2Z256rmi", "VPERM2F128rm", false},
+ {"VSHUFF64X2Z256rri", "VPERM2F128rr", false},
+ {"VSHUFI32X4Z256rmi", "VPERM2I128rm", false},
+ {"VSHUFI32X4Z256rri", "VPERM2I128rr", false},
+ {"VSHUFI64X2Z256rmi", "VPERM2I128rm", false},
+ {"VSHUFI64X2Z256rri", "VPERM2I128rr", false},
};
// Print the manually added entries
diff --git a/utils/update_llc_test_checks.py b/utils/update_llc_test_checks.py
index e4e7e2ce41e..c3320f204a0 100755
--- a/utils/update_llc_test_checks.py
+++ b/utils/update_llc_test_checks.py
@@ -58,6 +58,12 @@ ASM_FUNCTION_PPC_RE = re.compile(
r'.Lfunc_end[0-9]+:\n',
flags=(re.M | re.S))
+ASM_FUNCTION_RISCV_RE = re.compile(
+ r'^_?(?P<func>[^:]+):[ \t]*#+[ \t]*@(?P=func)\n[^:]*?'
+ r'(?P<body>^##?[ \t]+[^:]+:.*?)\s*'
+ r'.Lfunc_end[0-9]+:\n',
+ flags=(re.M | re.S))
+
ASM_FUNCTION_SYSTEMZ_RE = re.compile(
r'^_?(?P<func>[^:]+):[ \t]*#+[ \t]*@(?P=func)\n'
r'[ \t]+.cfi_startproc\n'
@@ -135,6 +141,16 @@ def scrub_asm_powerpc64(asm, args):
asm = SCRUB_TRAILING_WHITESPACE_RE.sub(r'', asm)
return asm
+def scrub_asm_riscv(asm, args):
+ # Scrub runs of whitespace out of the assembly, but leave the leading
+ # whitespace in place.
+ asm = SCRUB_WHITESPACE_RE.sub(r' ', asm)
+ # Expand the tabs used for indentation.
+ asm = string.expandtabs(asm, 2)
+ # Strip trailing whitespace.
+ asm = SCRUB_TRAILING_WHITESPACE_RE.sub(r'', asm)
+ return asm
+
def scrub_asm_systemz(asm, args):
# Scrub runs of whitespace out of the assembly, but leave the leading
# whitespace in place.
@@ -161,6 +177,8 @@ def build_function_body_dictionary(raw_tool_output, triple, prefixes, func_dict,
'armeb-eabi': (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_RE),
'powerpc64': (scrub_asm_powerpc64, ASM_FUNCTION_PPC_RE),
'powerpc64le': (scrub_asm_powerpc64, ASM_FUNCTION_PPC_RE),
+ 'riscv32': (scrub_asm_riscv, ASM_FUNCTION_RISCV_RE),
+ 'riscv64': (scrub_asm_riscv, ASM_FUNCTION_RISCV_RE),
's390x': (scrub_asm_systemz, ASM_FUNCTION_SYSTEMZ_RE),
}
handlers = None
diff --git a/utils/update_mir_test_checks.py b/utils/update_mir_test_checks.py
index 015c4279bad..2934f09f6b3 100755
--- a/utils/update_mir_test_checks.py
+++ b/utils/update_mir_test_checks.py
@@ -254,7 +254,8 @@ def add_check_lines(test, output_lines, prefix, func_name, single_bb,
func_line = func_line.replace(
vreg.group(1), '[[{}:%[0-9]+]]'.format(name), 1)
for number, name in vreg_map.items():
- func_line = func_line.replace(number, '[[{}]]'.format(name))
+ func_line = re.sub(r'{}\b'.format(number), '[[{}]]'.format(name),
+ func_line)
check_line = '{}: {}'.format(check, func_line[indent:]).rstrip()
output_lines.append(check_line)