From 652842ec9ac1a6730335ad89827eb4133c0253fd Mon Sep 17 00:00:00 2001
From: Simon Dardis
Date: Thu, 2 Nov 2017 12:47:22 +0000
Subject: [mips] Use register scavenging with MSA.

MSA stores and loads to the stack are more likely to require an emergency
GPR spill slot, due to the smaller offsets available with those
instructions.

Handle this by overestimating the size of the stack: determine the largest
possible positive offset from $sp by presuming that all callee-saved
registers are spilled and by accounting for incoming arguments, then use
that estimate to decide whether an emergency spill slot is required.

Reviewers: atanasyan

Differential Revision: https://reviews.llvm.org/D39056

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317204 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MipsFrameLowering.cpp     |  35 ++---
 lib/Target/Mips/MipsSEFrameLowering.cpp   |   8 +-
 test/CodeGen/Mips/msa/emergency-spill.mir | 221 ++++++++++++++++++++++++++++++
 test/CodeGen/Mips/msa/frameindex.ll       |  49 ++++---
 4 files changed, 272 insertions(+), 41 deletions(-)
 create mode 100644 test/CodeGen/Mips/msa/emergency-spill.mir

diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index ef05166503b..27a85970da6 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -107,38 +107,31 @@ bool MipsFrameLowering::hasBP(const MachineFunction &MF) const {
   return MFI.hasVarSizedObjects() && TRI->needsStackRealignment(MF);
 }

+// Estimate the size of the stack, including the incoming arguments. We need to
+// account for register spills, local objects, reserved call frame and incoming
+// arguments. This is required to determine the largest possible positive offset
+// from $sp so that it can be determined if an emergency spill slot for stack
+// addresses is required.
 uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   const TargetRegisterInfo &TRI = *STI.getRegisterInfo();

-  int64_t Offset = 0;
+  int64_t Size = 0;

-  // Iterate over fixed sized objects.
+  // Iterate over fixed sized objects which are incoming arguments.
   for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
-    Offset = std::max(Offset, -MFI.getObjectOffset(I));
+    if (MFI.getObjectOffset(I) > 0)
+      Size += MFI.getObjectSize(I);

   // Conservatively assume all callee-saved registers will be saved.
   for (const MCPhysReg *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) {
-    unsigned Size = TRI.getSpillSize(*TRI.getMinimalPhysRegClass(*R));
-    Offset = alignTo(Offset + Size, Size);
+    unsigned RegSize = TRI.getSpillSize(*TRI.getMinimalPhysRegClass(*R));
+    Size = alignTo(Size + RegSize, RegSize);
   }

-  unsigned MaxAlign = MFI.getMaxAlignment();
-
-  // Check that MaxAlign is not zero if there is a stack object that is not a
-  // callee-saved spill.
-  assert(!MFI.getObjectIndexEnd() || MaxAlign);
-
-  // Iterate over other objects.
-  for (unsigned I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I)
-    Offset = alignTo(Offset + MFI.getObjectSize(I), MaxAlign);
-
-  // Call frame.
-  if (MFI.adjustsStack() && hasReservedCallFrame(MF))
-    Offset = alignTo(Offset + MFI.getMaxCallFrameSize(),
-                     std::max(MaxAlign, getStackAlignment()));
-
-  return alignTo(Offset, getStackAlignment());
+  // Get the size of the rest of the frame objects and any possible reserved
+  // call frame, accounting for alignment.
+  return Size + MFI.estimateStackSize(MF);
 }

 // Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions
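The narrower reach is the crux of the change: MSA loads and stores encode a
signed 10-bit scaled offset, while ordinary GPR accesses reach a signed
16-bit offset, so a frame an ordinary load can address directly may already
be out of range for MSA. A minimal standalone sketch of the fit check,
where fitsInSignedBits is a hypothetical stand-in for llvm::isIntN (an
illustration under that assumption, not the in-tree code):

#include <cstdint>
#include <iostream>

// True iff Offset fits in an N-bit signed immediate (mirrors llvm::isIntN).
static bool fitsInSignedBits(unsigned N, int64_t Offset) {
  const int64_t Min = -(int64_t(1) << (N - 1));
  const int64_t Max = (int64_t(1) << (N - 1)) - 1;
  return Offset >= Min && Offset <= Max;
}

int main() {
  // 6400 mirrors the large alloca in the new emergency-spill.mir test below.
  const int64_t EstimatedSPOffset = 6400;
  std::cout << "MSA (10-bit): " << fitsInSignedBits(10, EstimatedSPOffset)
            << '\n'; // 0 -> emergency spill slot required
  std::cout << "GPR (16-bit): " << fitsInSignedBits(16, EstimatedSPOffset)
            << '\n'; // 1 -> reachable without a slot
}

When the check fails, or a variable-sized object makes the estimate
unreliable, determineCalleeSaves reserves a scavenging frame index, as the
next hunk shows.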
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index 0b19b18449e..ca19089c912 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -893,10 +893,12 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
   }

   // Set scavenging frame index if necessary.
-  uint64_t MaxSPOffset = MF.getInfo<MipsFunctionInfo>()->getIncomingArgSize() +
-    estimateStackSize(MF);
+  uint64_t MaxSPOffset = estimateStackSize(MF);

-  if (isInt<16>(MaxSPOffset))
+  // MSA loads and stores only have a signed 10-bit offset, rather than the
+  // usual signed 16 bits. If there is a variable-sized object on the stack,
+  // the estimate cannot account for it.
+  if (isIntN(STI.hasMSA() ? 10 : 16, MaxSPOffset) &&
+      !MF.getFrameInfo().hasVarSizedObjects())
     return;

   const TargetRegisterClass &RC =
diff --git a/test/CodeGen/Mips/msa/emergency-spill.mir b/test/CodeGen/Mips/msa/emergency-spill.mir
new file mode 100644
index 00000000000..502b60f673e
--- /dev/null
+++ b/test/CodeGen/Mips/msa/emergency-spill.mir
@@ -0,0 +1,221 @@
+# RUN: llc %s -start-after=shrink-wrap -march=mips64 -mcpu=mips64r6 -mattr=+fp64,+msa -o /dev/null
+
+# Test that the estimated size of the stack leads to the creation of an
+# emergency spill slot when MSA is in use. Previously, this test case would
+# fail during register scavenging due to the lack of a spill slot.
+--- |
+  define inreg { i64, i64 } @test(i64 inreg %a.coerce0, i64 inreg %a.coerce1, i64 inreg %b.coerce0, i64 inreg %b.coerce1, i32 signext %c) #0 {
+  entry:
+    %retval = alloca <16 x i8>, align 16
+    %a = alloca <16 x i8>, align 16
+    %b = alloca <16 x i8>, align 16
+    %a.addr = alloca <16 x i8>, align 16
+    %b.addr = alloca <16 x i8>, align 16
+    %c.addr = alloca i32, align 4
+    %g = alloca <16 x i8>*, align 8
+    %d = alloca i8*, align 8
+    %0 = bitcast <16 x i8>* %a to { i64, i64 }*
+    %1 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i32 0, i32 0
+    store i64 %a.coerce0, i64* %1, align 16
+    %2 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i32 0, i32 1
+    store i64 %a.coerce1, i64* %2, align 8
+    %a1 = load <16 x i8>, <16 x i8>* %a, align 16
+    %3 = bitcast <16 x i8>* %b to { i64, i64 }*
+    %4 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %3, i32 0, i32 0
+    store i64 %b.coerce0, i64* %4, align 16
+    %5 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %3, i32 0, i32 1
+    store i64 %b.coerce1, i64* %5, align 8
+    %b2 = load <16 x i8>, <16 x i8>* %b, align 16
+    store <16 x i8> %a1, <16 x i8>* %a.addr, align 16
+    store <16 x i8> %b2, <16 x i8>* %b.addr, align 16
+    store i32 %c, i32* %c.addr, align 4
+    %6 = alloca i8, i64 6400, align 16
+    %7 = bitcast i8* %6 to <16 x i8>*
+    store <16 x i8>* %7, <16 x i8>** %g, align 8
+    %8 = load <16 x i8>*, <16 x i8>** %g, align 8
+    call void @h(<16 x i8>* %b.addr, <16 x i8>* %8)
+    %9 = load <16 x i8>*, <16 x i8>** %g, align 8
+    %10 = bitcast <16 x i8>* %9 to i8*
+    store i8* %10, i8** %d, align 8
+    %11 = load <16 x i8>, <16 x i8>* %a.addr, align 16
+    %12 = load i8*, i8** %d, align 8
+    %arrayidx = getelementptr inbounds i8, i8* %12, i64 0
+    %13 = load i8, i8* %arrayidx, align 1
+    %conv = sext i8 %13 to i32
+    %14 = call <16 x i8> @llvm.mips.fill.b(i32 %conv)
+    %add = add <16 x i8> %11, %14
+    %15 = load i8*, i8** %d, align 8
+    %arrayidx3 = getelementptr inbounds i8, i8* %15, i64 1
+    %16 = load i8, i8* %arrayidx3, align 1
+    %conv4 = sext i8 %16 to i32
+    %17 = call <16 x i8> @llvm.mips.fill.b(i32 %conv4)
+    %add5 = add
<16 x i8> %add, %17 + %18 = load <16 x i8>, <16 x i8>* %b.addr, align 16 + %add6 = add <16 x i8> %18, %add5 + store <16 x i8> %add6, <16 x i8>* %b.addr, align 16 + %19 = load <16 x i8>, <16 x i8>* %b.addr, align 16 + store <16 x i8> %19, <16 x i8>* %retval, align 16 + %20 = bitcast <16 x i8>* %retval to { i64, i64 }* + %21 = load { i64, i64 }, { i64, i64 }* %20, align 16 + ret { i64, i64 } %21 + } + + declare void @h(<16 x i8>*, <16 x i8>*) + + declare <16 x i8> @llvm.mips.fill.b(i32) + + declare void @llvm.stackprotector(i8*, i8**) + +... +--- +name: test +alignment: 3 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: +liveins: + - { reg: '%a0_64', virtual-reg: '' } + - { reg: '%a1_64', virtual-reg: '' } + - { reg: '%a2_64', virtual-reg: '' } + - { reg: '%a3_64', virtual-reg: '' } + - { reg: '%t0_64', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 16 + adjustsStack: false + hasCalls: true + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: + - { id: 0, name: retval, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 1, name: a, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 2, name: b, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 3, name: a.addr, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 4, name: b.addr, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 5, name: c.addr, type: default, offset: 0, size: 4, alignment: 4, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 6, name: g, type: default, offset: 0, size: 8, alignment: 8, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 7, name: d, type: default, offset: 0, size: 8, alignment: 8, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 8, name: '', type: default, offset: 0, size: 6400, + alignment: 16, stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } +constants: +body: | + bb.0.entry: + liveins: %a0_64, %a1_64, %a2_64, %a3_64, %t0_64 + + SD killed %a0_64, %stack.1.a, 0 :: (store 8 into %ir.1, align 16) + SD killed %a1_64, %stack.1.a, 8 :: (store 8 into %ir.2) + %w0 = LD_B %stack.1.a, 0 :: (dereferenceable load 16 from %ir.a) + SD killed %a2_64, %stack.2.b, 0 :: (store 8 into %ir.4, align 16) + SD killed %a3_64, %stack.2.b, 8 :: (store 8 into %ir.5) + %w1 = LD_B 
%stack.2.b, 0 :: (dereferenceable load 16 from %ir.b) + ST_B killed %w0, %stack.3.a.addr, 0 :: (store 16 into %ir.a.addr) + ST_B killed %w1, %stack.4.b.addr, 0 :: (store 16 into %ir.b.addr) + SW %t0, %stack.5.c.addr, 0, implicit killed %t0_64 :: (store 4 into %ir.c.addr) + %at_64 = LEA_ADDiu64 %stack.8, 0 + SD killed %at_64, %stack.6.g, 0 :: (store 8 into %ir.g) + %a1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp + %a0_64 = LEA_ADDiu64 %stack.4.b.addr, 0 + JAL @h, csr_n64, implicit-def dead %ra, implicit %a0_64, implicit %a1_64, implicit-def %sp + ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp + %at_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %v0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %v1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %a0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %a1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %a2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %a3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t4_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t5_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t6_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t7_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s4_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s5_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s6_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s7_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t8_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t9_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %ra_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %w0 = LD_B %stack.3.a.addr, 0 :: (dereferenceable load 16 from %ir.a.addr) + SD %at_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %v0_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %v1_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %a0_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %a1_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %a2_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %a3_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t0_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t1_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t2_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t3_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t4_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t5_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t6_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t7_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s0_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s1_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s2_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s3_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s4_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s5_64, %stack.7.d, 0 
:: (store 8 into %ir.d)
+    SD %s6_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %s7_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %t8_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %t9_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %ra_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    %at_64 = LD %stack.7.d, 0 :: (dereferenceable load 8 from %ir.d)
+    %v0 = LB %at_64, 0 :: (load 1 from %ir.arrayidx)
+    %w1 = FILL_B killed %v0
+    %w0 = ADDV_B killed %w0, killed %w1
+    %at = LB killed %at_64, 1 :: (load 1 from %ir.arrayidx3)
+    %w1 = FILL_B killed %at
+    %w0 = ADDV_B killed %w0, killed %w1
+    %w1 = LD_B %stack.4.b.addr, 0 :: (dereferenceable load 16 from %ir.b.addr)
+    %w0 = ADDV_B killed %w1, killed %w0
+    ST_B killed %w0, %stack.4.b.addr, 0 :: (store 16 into %ir.b.addr)
+    %w0 = LD_B %stack.4.b.addr, 0 :: (dereferenceable load 16 from %ir.b.addr)
+    ST_B killed %w0, %stack.0.retval, 0 :: (store 16 into %ir.retval)
+    %v0_64 = LD %stack.0.retval, 0 :: (dereferenceable load 8 from %ir.20, align 16)
+    %v1_64 = LD %stack.0.retval, 8 :: (dereferenceable load 8 from %ir.20 + 8, align 16)
+    RetRA implicit %v0_64, implicit %v1_64
+
+...
diff --git a/test/CodeGen/Mips/msa/frameindex.ll b/test/CodeGen/Mips/msa/frameindex.ll
index f903381f9ef..9c2228d3bf6 100644
--- a/test/CodeGen/Mips/msa/frameindex.ll
+++ b/test/CodeGen/Mips/msa/frameindex.ll
@@ -18,7 +18,8 @@ define void @loadstore_v16i8_just_under_simm10() nounwind {
   ; MIPS32-AE: loadstore_v16i8_just_under_simm10:

   %1 = alloca <16 x i8>
-  %2 = alloca [496 x i8] ; Push the frame right up to 512 bytes
+  %2 = alloca [492 x i8] ; Push the frame--accounting for the emergency spill
+                         ; slot--right up to 512 bytes

   %3 = load volatile <16 x i8>, <16 x i8>* %1
   ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 496($sp)
@@ -33,7 +34,8 @@ define void @loadstore_v16i8_just_over_simm10() nounwind {
   ; MIPS32-AE: loadstore_v16i8_just_over_simm10:

   %1 = alloca <16 x i8>
-  %2 = alloca [497 x i8] ; Push the frame just over 512 bytes
+  %2 = alloca [497 x i8] ; Push the frame--accounting for the emergency spill
+                         ; slot--just over 512 bytes

   %3 = load volatile <16 x i8>, <16 x i8>* %1
   ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 512
@@ -50,7 +52,8 @@ define void @loadstore_v16i8_just_under_simm16() nounwind {
   ; MIPS32-AE: loadstore_v16i8_just_under_simm16:

   %1 = alloca <16 x i8>
-  %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+  %2 = alloca [32752 x i8] ; Push the frame--accounting for the emergency spill
+                           ; slot--right up to 32768 bytes

   %3 = load volatile <16 x i8>, <16 x i8>* %1
   ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -69,7 +72,8 @@ define void @loadstore_v16i8_just_over_simm16() nounwind {
   ; MIPS32-AE: loadstore_v16i8_just_over_simm16:

   %1 = alloca <16 x i8>
-  %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+  %2 = alloca [32753 x i8] ; Push the frame--accounting for the emergency spill
+                           ; slot--just over 32768 bytes

   %3 = load volatile <16 x i8>, <16 x i8>* %1
   ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -121,7 +125,8 @@ define void @loadstore_v8i16_just_under_simm10() nounwind {
   ; MIPS32-AE: loadstore_v8i16_just_under_simm10:

   %1 = alloca <8 x i16>
-  %2 = alloca [1008 x i8] ; Push the frame right up to 1024 bytes
+  %2 = alloca [1004 x i8] ; Push the frame--accounting for the emergency spill
+                          ; slot--right up to 1024 bytes

   %3 = load volatile <8 x i16>, <8 x i16>* %1
   ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 1008($sp)
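The new "just under" sizes in these tests shrink by exactly 4 bytes because
the emergency GPR spill slot now sits inside the reachable range, and the
boundary itself scales with the element size, since the signed 10-bit MSA
immediate is scaled by the element width. A quick standalone check of that
arithmetic (hypothetical helper program, not part of the patch):

#include <cstdint>
#include <iostream>

int main() {
  const int64_t SpillSlotSize = 4; // one MIPS32 GPR
  const int64_t VectorSize = 16;   // one 128-bit MSA value
  for (int64_t ElemSize : {1, 2, 4, 8}) {
    // Exclusive upper bound of the scaled s10 offset range, in bytes.
    const int64_t Reach = 512 * ElemSize;
    const int64_t JustUnder = Reach - VectorSize - SpillSlotSize;
    std::cout << "elem " << ElemSize << ": reach " << Reach
              << ", just-under alloca " << JustUnder << '\n';
  }
  // Prints 492, 1004, 2028 and 4076 -- the new [N x i8] sizes in the hunks
  // above and below. The "just over" sizes are unchanged: those frames were
  // already out of range, so the reserved slot does not affect them.
}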
@@ -136,7 +141,8 @@ define void @loadstore_v8i16_just_over_simm10() nounwind {
   ; MIPS32-AE: loadstore_v8i16_just_over_simm10:

   %1 = alloca <8 x i16>
-  %2 = alloca [1009 x i8] ; Push the frame just over 1024 bytes
+  %2 = alloca [1009 x i8] ; Push the frame--accounting for the emergency spill
+                          ; slot--just over 1024 bytes

   %3 = load volatile <8 x i16>, <8 x i16>* %1
   ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1024
@@ -153,7 +159,8 @@ define void @loadstore_v8i16_just_under_simm16() nounwind {
   ; MIPS32-AE: loadstore_v8i16_just_under_simm16:

   %1 = alloca <8 x i16>
-  %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+  %2 = alloca [32752 x i8] ; Push the frame--accounting for the emergency spill
+                           ; slot--right up to 32768 bytes

   %3 = load volatile <8 x i16>, <8 x i16>* %1
   ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -172,7 +179,8 @@ define void @loadstore_v8i16_just_over_simm16() nounwind {
   ; MIPS32-AE: loadstore_v8i16_just_over_simm16:

   %1 = alloca <8 x i16>
-  %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+  %2 = alloca [32753 x i8] ; Push the frame--accounting for the emergency spill
+                           ; slot--just over 32768 bytes

   %3 = load volatile <8 x i16>, <8 x i16>* %1
   ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -224,7 +232,8 @@ define void @loadstore_v4i32_just_under_simm10() nounwind {
   ; MIPS32-AE: loadstore_v4i32_just_under_simm10:

   %1 = alloca <4 x i32>
-  %2 = alloca [2032 x i8] ; Push the frame right up to 2048 bytes
+  %2 = alloca [2028 x i8] ; Push the frame--accounting for the emergency spill
+                          ; slot--right up to 2048 bytes

   %3 = load volatile <4 x i32>, <4 x i32>* %1
   ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 2032($sp)
@@ -239,7 +248,8 @@ define void @loadstore_v4i32_just_over_simm10() nounwind {
   ; MIPS32-AE: loadstore_v4i32_just_over_simm10:

   %1 = alloca <4 x i32>
-  %2 = alloca [2033 x i8] ; Push the frame just over 2048 bytes
+  %2 = alloca [2033 x i8] ; Push the frame--accounting for the emergency spill
+                          ; slot--just over 2048 bytes

   %3 = load volatile <4 x i32>, <4 x i32>* %1
   ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 2048
@@ -256,7 +266,8 @@ define void @loadstore_v4i32_just_under_simm16() nounwind {
   ; MIPS32-AE: loadstore_v4i32_just_under_simm16:

   %1 = alloca <4 x i32>
-  %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+  %2 = alloca [32752 x i8] ; Push the frame--accounting for the emergency spill
+                           ; slot--right up to 32768 bytes

   %3 = load volatile <4 x i32>, <4 x i32>* %1
   ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -275,7 +286,8 @@ define void @loadstore_v4i32_just_over_simm16() nounwind {
   ; MIPS32-AE: loadstore_v4i32_just_over_simm16:

   %1 = alloca <4 x i32>
-  %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+  %2 = alloca [32753 x i8] ; Push the frame--accounting for the emergency spill
+                           ; slot--just over 32768 bytes

   %3 = load volatile <4 x i32>, <4 x i32>* %1
   ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -327,8 +339,8 @@ define void @loadstore_v2i64_just_under_simm10() nounwind {
   ; MIPS32-AE: loadstore_v2i64_just_under_simm10:

   %1 = alloca <2 x i64>
-  %2 = alloca [4080 x i8] ; Push the frame right up to 4096 bytes
-
+  %2 = alloca [4076 x i8] ; Push the frame--accounting for the emergency spill
+                          ; slot--right up to 4096 bytes
   %3 = load volatile <2 x i64>, <2 x i64>* %1
   ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 4080($sp)
   store volatile <2 x i64> %3, <2 x i64>* %1
@@ -342,7 +354,8 @@ define void @loadstore_v2i64_just_over_simm10() nounwind {
   ; MIPS32-AE: loadstore_v2i64_just_over_simm10:

   %1 = alloca <2 x i64>
-  %2 = alloca [4081 x i8] ; Push the frame just over 4096 bytes
+  %2 = alloca [4081 x i8] ; Push the frame--accounting for the emergency spill
+                          ; slot--just over 4096 bytes

   %3 = load volatile <2 x i64>, <2 x i64>* %1
   ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 4096
@@ -359,7 +372,8 @@ define void @loadstore_v2i64_just_under_simm16() nounwind {
   ; MIPS32-AE: loadstore_v2i64_just_under_simm16:

   %1 = alloca <2 x i64>
-  %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+  %2 = alloca [32752 x i8] ; Push the frame--accounting for the emergency spill
+                           ; slot--right up to 32768 bytes

   %3 = load volatile <2 x i64>, <2 x i64>* %1
   ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -378,7 +392,8 @@ define void @loadstore_v2i64_just_over_simm16() nounwind {
   ; MIPS32-AE: loadstore_v2i64_just_over_simm16:

   %1 = alloca <2 x i64>
-  %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+  %2 = alloca [32753 x i8] ; Push the frame--accounting for the emergency spill
+                           ; slot--just over 32768 bytes

   %3 = load volatile <2 x i64>, <2 x i64>* %1
   ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
-- 
cgit v1.2.3


From a223e9099142f78a1c2463b83d5351cdfa3d2fc1 Mon Sep 17 00:00:00 2001
From: Ayman Musa
Date: Thu, 2 Nov 2017 13:07:06 +0000
Subject: [X86] Fix bug in legalize vector types - Split large loads

When splitting a large load into smaller legally-typed loads, the last load
must be padded to reach the size of the previous load so that a
CONCAT_VECTORS node can reunite them. The code currently pads the last load
to reach the size of the first load instead of the previous one.

Differential Revision: https://reviews.llvm.org/D38495

Change-Id: Ib60b55ed26ce901fabf68108daf52683fbd5013f
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317206 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp |   2 +-
 test/CodeGen/X86/pr34653.ll                      | 209 +++++++++++++++++++++++
 test/CodeGen/X86/pr34657.ll                      |  20 +++
 3 files changed, 230 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/X86/pr34653.ll
 create mode 100644 test/CodeGen/X86/pr34657.ll

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5d6c4998ecd..b55414b51b8 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3844,7 +3844,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
     }
     LdOps.push_back(L);
-
+    LdOp = L;
     LdWidth -= NewVTWidth;
   }

diff --git a/test/CodeGen/X86/pr34653.ll b/test/CodeGen/X86/pr34653.ll
new file mode 100644
index 00000000000..4b16ffd33d5
--- /dev/null
+++ b/test/CodeGen/X86/pr34653.ll
@@ -0,0 +1,209 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+avx512f -o - | FileCheck %s
+
+declare fastcc <38 x double> @test()
+
+define void @pr34653() {
+; CHECK-LABEL: pr34653:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
+; CHECK-NEXT:    andq $-512, %rsp # imm = 0xFE00
+; CHECK-NEXT:    subq $2048, %rsp # imm = 0x800
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    callq test
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, %xmm1
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm2
+; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vmovaps %xmm3, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm5
+; CHECK-NEXT:    vmovaps %xmm5, %xmm6
+; 
CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm7 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm8 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm9 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm10 +; CHECK-NEXT: vextractf32x4 $3, %zmm10, %xmm11 +; CHECK-NEXT: vmovaps %xmm11, %xmm12 +; CHECK-NEXT: vextractf32x4 $2, %zmm10, %xmm13 +; CHECK-NEXT: vmovaps %xmm13, %xmm14 +; CHECK-NEXT: vmovaps %xmm10, %xmm15 +; CHECK-NEXT: vmovaps %xmm15, %xmm2 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $3, %zmm9, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $2, %zmm9, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm9, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $3, %zmm8, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $2, %zmm8, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm8, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $3, %zmm7, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $2, %zmm7, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm7, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm11 = xmm11[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm13[1,0] +; CHECK-NEXT: # kill: %YMM10 %YMM10 %ZMM10 +; CHECK-NEXT: vextractf128 $1, %ymm10, %xmm10 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm10, %xmm0 +; CHECK-NEXT: vpermilpd {{.*#+}} xmm15 = xmm15[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: # kill: %YMM9 %YMM9 %ZMM9 +; CHECK-NEXT: vextractf128 $1, %ymm9, %xmm9 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm9, %xmm0 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: # kill: %YMM8 %YMM8 %ZMM8 +; CHECK-NEXT: vextractf128 $1, %ymm8, %xmm8 +; CHECK-NEXT: vmovsd 
%xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm8, %xmm0 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: # kill: %YMM7 %YMM7 %ZMM7 +; CHECK-NEXT: vextractf128 $1, %ymm7, %xmm7 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm7, %xmm0 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero 
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm8, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm13, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm1, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm14, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm2, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm4, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm9, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm10, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm15, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm11, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm3, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm6, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm5, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm12, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm7, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %v = call fastcc <38 x double> @test() + %v.0 = extractelement <38 x double> %v, i32 0 + ret void +} + diff --git a/test/CodeGen/X86/pr34657.ll b/test/CodeGen/X86/pr34657.ll new file mode 100644 index 00000000000..a63bc2a08dd --- /dev/null +++ b/test/CodeGen/X86/pr34657.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw -o - | FileCheck %s + +define <112 x i8> @pr34657() local_unnamed_addr { +; CHECK-LABEL: pr34657 +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vmovups (%rax), %xmm0 +; CHECK-NEXT: vmovups (%rax), %ymm1 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovups (%rax), %zmm2 +; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) +; CHECK-NEXT: vmovaps %zmm2, (%rdi) +; CHECK-NEXT: vextractf32x4 $2, %zmm0, 96(%rdi) +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %wide.vec51 = load <112 x i8>, <112 x i8>* undef, align 2 + ret <112 x i8> %wide.vec51 +} -- cgit v1.2.3 From f08c3d1d13d0fdc28dff010a88bd9f960c5ea7a9 Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Thu, 2 Nov 2017 15:02:51 +0000 
Subject: [ExpandMemCmp] Split ExpandMemCmp from CodeGen into its own pass. Summary: This is mostly a noop (most of the test diffs are renamed blocks). There are a few temporary register renames (eax<->ecx) and a few blocks are shuffled around. See the discussion in PR33325 for more details. Reviewers: spatel Subscribers: mgorny Differential Revision: https://reviews.llvm.org/D39456 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317211 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/InitializePasses.h | 1 + include/llvm/LinkAllPasses.h | 1 + include/llvm/Transforms/Scalar.h | 8 +- lib/CodeGen/CodeGenPrepare.cpp | 710 --------------------- lib/CodeGen/TargetPassConfig.cpp | 10 +- lib/Transforms/Scalar/CMakeLists.txt | 1 + lib/Transforms/Scalar/ExpandMemCmp.cpp | 828 +++++++++++++++++++++++++ lib/Transforms/Scalar/Scalar.cpp | 1 + test/CodeGen/Generic/llc-start-stop.ll | 6 +- test/CodeGen/X86/memcmp-optsize.ll | 224 ++++--- test/CodeGen/X86/memcmp.ll | 240 ++++--- test/Transforms/CodeGenPrepare/X86/memcmp.ll | 771 ----------------------- test/Transforms/ExpandMemCmp/X86/lit.local.cfg | 3 + test/Transforms/ExpandMemCmp/X86/memcmp.ll | 792 +++++++++++++++++++++++ 14 files changed, 1874 insertions(+), 1722 deletions(-) create mode 100644 lib/Transforms/Scalar/ExpandMemCmp.cpp delete mode 100644 test/Transforms/CodeGenPrepare/X86/memcmp.ll create mode 100644 test/Transforms/ExpandMemCmp/X86/lit.local.cfg create mode 100644 test/Transforms/ExpandMemCmp/X86/memcmp.ll diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index c3ad8fe41af..67a077081f7 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -128,6 +128,7 @@ void initializeEdgeBundlesPass(PassRegistry&); void initializeEfficiencySanitizerPass(PassRegistry&); void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&); void initializeExpandISelPseudosPass(PassRegistry&); +void initializeExpandMemCmpPassPass(PassRegistry&); void initializeExpandPostRAPass(PassRegistry&); void initializeExpandReductionsPass(PassRegistry&); void initializeExternalAAWrapperPassPass(PassRegistry&); diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index 765e63926da..ce70f53ccb0 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -180,6 +180,7 @@ namespace { (void) llvm::createReversePostOrderFunctionAttrsPass(); (void) llvm::createMergeFunctionsPass(); (void) llvm::createMergeICmpsPass(); + (void) llvm::createExpandMemCmpPass(); std::string buf; llvm::raw_string_ostream os(buf); (void) llvm::createPrintModulePass(os); diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index 8ef65774a93..4b365858787 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -422,10 +422,16 @@ Pass *createLowerGuardIntrinsicPass(); //===----------------------------------------------------------------------===// // -// MergeICmps - Merge integer comparison chains +// MergeICmps - Merge integer comparison chains into a memcmp // Pass *createMergeICmpsPass(); +//===----------------------------------------------------------------------===// +// +// ExpandMemCmp - Expand memcmp() to load/stores. 
+//
+Pass *createExpandMemCmpPass();
+
 //===----------------------------------------------------------------------===//
 //
 // ValuePropagation - Propagate CFG-derived value information
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 51f2a320b29..973ddebd987 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -123,12 +123,6 @@ STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");

-STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
-STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
-STATISTIC(NumMemCmpGreaterThanMax,
-          "Number of memcmp calls with size greater than max size");
-STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
-
 static cl::opt<bool> DisableBranchOpts(
     "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
     cl::desc("Disable branch optimizations in CodeGenPrepare"));
@@ -189,11 +183,6 @@ EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
     cl::desc("Enable merging of redundant sexts when one is dominating"
              " the other."), cl::init(true));

-static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
-    "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
-    cl::desc("The number of loads per basic block for inline expansion of "
-             "memcmp that is only being compared against zero."));
-
 namespace {

 using SetOfInstrs = SmallPtrSet<Instruction *, 16>;
@@ -1697,699 +1686,6 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
   return true;
 }

-namespace {
-
-// This class provides helper functions to expand a memcmp library call into an
-// inline expansion.
-class MemCmpExpansion {
-  struct ResultBlock {
-    BasicBlock *BB = nullptr;
-    PHINode *PhiSrc1 = nullptr;
-    PHINode *PhiSrc2 = nullptr;
-
-    ResultBlock() = default;
-  };
-
-  CallInst *const CI;
-  ResultBlock ResBlock;
-  const uint64_t Size;
-  unsigned MaxLoadSize;
-  uint64_t NumLoadsNonOneByte;
-  const uint64_t NumLoadsPerBlock;
-  std::vector<BasicBlock *> LoadCmpBlocks;
-  BasicBlock *EndBlock;
-  PHINode *PhiRes;
-  const bool IsUsedForZeroCmp;
-  const DataLayout &DL;
-  IRBuilder<> Builder;
-  // Represents the decomposition in blocks of the expansion. For example,
-  // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
-  // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}].
-  // TODO(courbet): Involve the target more in this computation. On X86, 7
-  // bytes can be done more efficiently with two overlapping 4-byte loads than
-  // covering the interval with [{4, 0}, {2, 4}, {1, 6}].
-  struct LoadEntry {
-    LoadEntry(unsigned LoadSize, uint64_t Offset)
-        : LoadSize(LoadSize), Offset(Offset) {
-      assert(Offset % LoadSize == 0 && "invalid load entry");
-    }
-
-    uint64_t getGEPIndex() const { return Offset / LoadSize; }
-
-    // The size of the load for this block, in bytes.
-    const unsigned LoadSize;
-    // The offset of this load WRT the base pointer, in bytes.
- const uint64_t Offset; - }; - SmallVector LoadSequence; - - void createLoadCmpBlocks(); - void createResultBlock(); - void setupResultBlockPHINodes(); - void setupEndBlockPHINodes(); - Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex); - void emitLoadCompareBlock(unsigned BlockIndex); - void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, - unsigned &LoadIndex); - void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex); - void emitMemCmpResultBlock(); - Value *getMemCmpExpansionZeroCase(); - Value *getMemCmpEqZeroOneBlock(); - Value *getMemCmpOneBlock(); - - public: - MemCmpExpansion(CallInst *CI, uint64_t Size, - const TargetTransformInfo::MemCmpExpansionOptions &Options, - unsigned MaxNumLoads, const bool IsUsedForZeroCmp, - unsigned NumLoadsPerBlock, const DataLayout &DL); - - unsigned getNumBlocks(); - uint64_t getNumLoads() const { return LoadSequence.size(); } - - Value *getMemCmpExpansion(); -}; - -} // end anonymous namespace - -// Initialize the basic block structure required for expansion of memcmp call -// with given maximum load size and memcmp size parameter. -// This structure includes: -// 1. A list of load compare blocks - LoadCmpBlocks. -// 2. An EndBlock, split from original instruction point, which is the block to -// return from. -// 3. ResultBlock, block to branch to for early exit when a -// LoadCmpBlock finds a difference. -MemCmpExpansion::MemCmpExpansion( - CallInst *const CI, uint64_t Size, - const TargetTransformInfo::MemCmpExpansionOptions &Options, - const unsigned MaxNumLoads, const bool IsUsedForZeroCmp, - const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout) - : CI(CI), - Size(Size), - MaxLoadSize(0), - NumLoadsNonOneByte(0), - NumLoadsPerBlock(NumLoadsPerBlock), - IsUsedForZeroCmp(IsUsedForZeroCmp), - DL(TheDataLayout), - Builder(CI) { - assert(Size > 0 && "zero blocks"); - // Scale the max size down if the target can load more bytes than we need. - size_t LoadSizeIndex = 0; - while (LoadSizeIndex < Options.LoadSizes.size() && - Options.LoadSizes[LoadSizeIndex] > Size) { - ++LoadSizeIndex; - } - this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex]; - // Compute the decomposition. - uint64_t CurSize = Size; - uint64_t Offset = 0; - while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) { - const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex]; - assert(LoadSize > 0 && "zero load size"); - const uint64_t NumLoadsForThisSize = CurSize / LoadSize; - if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) { - // Do not expand if the total number of loads is larger than what the - // target allows. Note that it's important that we exit before completing - // the expansion to avoid using a ton of memory to store the expansion for - // large sizes. - LoadSequence.clear(); - return; - } - if (NumLoadsForThisSize > 0) { - for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) { - LoadSequence.push_back({LoadSize, Offset}); - Offset += LoadSize; - } - if (LoadSize > 1) { - ++NumLoadsNonOneByte; - } - CurSize = CurSize % LoadSize; - } - ++LoadSizeIndex; - } - assert(LoadSequence.size() <= MaxNumLoads && "broken invariant"); -} - -unsigned MemCmpExpansion::getNumBlocks() { - if (IsUsedForZeroCmp) - return getNumLoads() / NumLoadsPerBlock + - (getNumLoads() % NumLoadsPerBlock != 0 ? 
1 : 0); - return getNumLoads(); -} - -void MemCmpExpansion::createLoadCmpBlocks() { - for (unsigned i = 0; i < getNumBlocks(); i++) { - BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb", - EndBlock->getParent(), EndBlock); - LoadCmpBlocks.push_back(BB); - } -} - -void MemCmpExpansion::createResultBlock() { - ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block", - EndBlock->getParent(), EndBlock); -} - -// This function creates the IR instructions for loading and comparing 1 byte. -// It loads 1 byte from each source of the memcmp parameters with the given -// GEPIndex. It then subtracts the two loaded values and adds this result to the -// final phi node for selecting the memcmp result. -void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex, - unsigned GEPIndex) { - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - Type *LoadSizeType = Type::getInt8Ty(CI->getContext()); - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using the GEPIndex. - if (GEPIndex != 0) { - Source1 = Builder.CreateGEP(LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, GEPIndex)); - Source2 = Builder.CreateGEP(LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, GEPIndex)); - } - - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext())); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext())); - Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); - - PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]); - - if (BlockIndex < (LoadCmpBlocks.size() - 1)) { - // Early exit branch if difference found to EndBlock. Otherwise, continue to - // next LoadCmpBlock, - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, - ConstantInt::get(Diff->getType(), 0)); - BranchInst *CmpBr = - BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp); - Builder.Insert(CmpBr); - } else { - // The last block has an unconditional branch to EndBlock. - BranchInst *CmpBr = BranchInst::Create(EndBlock); - Builder.Insert(CmpBr); - } -} - -/// Generate an equality comparison for one or more pairs of loaded values. -/// This is used in the case where the memcmp() call is compared equal or not -/// equal to zero. -Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex, - unsigned &LoadIndex) { - assert(LoadIndex < getNumLoads() && - "getCompareLoadPairs() called with no remaining loads"); - std::vector XorList, OrList; - Value *Diff; - - const unsigned NumLoads = - std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock); - - // For a single-block expansion, start inserting before the memcmp call. - if (LoadCmpBlocks.empty()) - Builder.SetInsertPoint(CI); - else - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - - Value *Cmp = nullptr; - // If we have multiple loads per block, we need to generate a composite - // comparison using xor+or. The type for the combinations is the largest load - // type. - IntegerType *const MaxLoadType = - NumLoads == 1 ? 
nullptr - : IntegerType::get(CI->getContext(), MaxLoadSize * 8); - for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) { - const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex]; - - IntegerType *LoadSizeType = - IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); - - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using a GEP. - if (CurLoadEntry.Offset != 0) { - Source1 = Builder.CreateGEP( - LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - Source2 = Builder.CreateGEP( - LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - } - - // Get a constant or load a value for each source address. - Value *LoadSrc1 = nullptr; - if (auto *Source1C = dyn_cast(Source1)) - LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL); - if (!LoadSrc1) - LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - - Value *LoadSrc2 = nullptr; - if (auto *Source2C = dyn_cast(Source2)) - LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL); - if (!LoadSrc2) - LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (NumLoads != 1) { - if (LoadSizeType != MaxLoadType) { - LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); - } - // If we have multiple loads per block, we need to generate a composite - // comparison using xor+or. - Diff = Builder.CreateXor(LoadSrc1, LoadSrc2); - Diff = Builder.CreateZExt(Diff, MaxLoadType); - XorList.push_back(Diff); - } else { - // If there's only one load per block, we just compare the loaded values. - Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2); - } - } - - auto pairWiseOr = [&](std::vector &InList) -> std::vector { - std::vector OutList; - for (unsigned i = 0; i < InList.size() - 1; i = i + 2) { - Value *Or = Builder.CreateOr(InList[i], InList[i + 1]); - OutList.push_back(Or); - } - if (InList.size() % 2 != 0) - OutList.push_back(InList.back()); - return OutList; - }; - - if (!Cmp) { - // Pairwise OR the XOR results. - OrList = pairWiseOr(XorList); - - // Pairwise OR the OR results until one result left. - while (OrList.size() != 1) { - OrList = pairWiseOr(OrList); - } - Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0)); - } - - return Cmp; -} - -void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, - unsigned &LoadIndex) { - Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex); - - BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1)) - ? EndBlock - : LoadCmpBlocks[BlockIndex + 1]; - // Early exit branch if difference found to ResultBlock. Otherwise, - // continue to next LoadCmpBlock or EndBlock. - BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); - Builder.Insert(CmpBr); - - // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 - // since early exit to ResultBlock was not taken (no difference was found in - // any of the bytes). 
- if (BlockIndex == LoadCmpBlocks.size() - 1) { - Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); - PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); - } -} - -// This function creates the IR intructions for loading and comparing using the -// given LoadSize. It loads the number of bytes specified by LoadSize from each -// source of the memcmp parameters. It then does a subtract to see if there was -// a difference in the loaded values. If a difference is found, it branches -// with an early exit to the ResultBlock for calculating which source was -// larger. Otherwise, it falls through to the either the next LoadCmpBlock or -// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with -// a special case through emitLoadCompareByteBlock. The special handling can -// simply subtract the loaded values and add it to the result phi node. -void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) { - // There is one load per block in this case, BlockIndex == LoadIndex. - const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex]; - - if (CurLoadEntry.LoadSize == 1) { - MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, - CurLoadEntry.getGEPIndex()); - return; - } - - Type *LoadSizeType = - IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); - Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); - assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type"); - - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using a GEP. - if (CurLoadEntry.Offset != 0) { - Source1 = Builder.CreateGEP( - LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - Source2 = Builder.CreateGEP( - LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - } - - // Load LoadSizeType from the base address. - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (DL.isLittleEndian()) { - Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::bswap, LoadSizeType); - LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); - LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); - } - - if (LoadSizeType != MaxLoadType) { - LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); - } - - // Add the loaded values to the phi nodes for calculating memcmp result only - // if result is not used in a zero equality. - if (!IsUsedForZeroCmp) { - ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]); - ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]); - } - - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2); - BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1)) - ? EndBlock - : LoadCmpBlocks[BlockIndex + 1]; - // Early exit branch if difference found to ResultBlock. Otherwise, continue - // to next LoadCmpBlock or EndBlock. 
- BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp); - Builder.Insert(CmpBr); - - // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 - // since early exit to ResultBlock was not taken (no difference was found in - // any of the bytes). - if (BlockIndex == LoadCmpBlocks.size() - 1) { - Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); - PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); - } -} - -// This function populates the ResultBlock with a sequence to calculate the -// memcmp result. It compares the two loaded source values and returns -1 if -// src1 < src2 and 1 if src1 > src2. -void MemCmpExpansion::emitMemCmpResultBlock() { - // Special case: if memcmp result is used in a zero equality, result does not - // need to be calculated and can simply return 1. - if (IsUsedForZeroCmp) { - BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); - Builder.SetInsertPoint(ResBlock.BB, InsertPt); - Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); - PhiRes->addIncoming(Res, ResBlock.BB); - BranchInst *NewBr = BranchInst::Create(EndBlock); - Builder.Insert(NewBr); - return; - } - BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); - Builder.SetInsertPoint(ResBlock.BB, InsertPt); - - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1, - ResBlock.PhiSrc2); - - Value *Res = - Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), - ConstantInt::get(Builder.getInt32Ty(), 1)); - - BranchInst *NewBr = BranchInst::Create(EndBlock); - Builder.Insert(NewBr); - PhiRes->addIncoming(Res, ResBlock.BB); -} - -void MemCmpExpansion::setupResultBlockPHINodes() { - Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); - Builder.SetInsertPoint(ResBlock.BB); - // Note: this assumes one load per block. - ResBlock.PhiSrc1 = - Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1"); - ResBlock.PhiSrc2 = - Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2"); -} - -void MemCmpExpansion::setupEndBlockPHINodes() { - Builder.SetInsertPoint(&EndBlock->front()); - PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); -} - -Value *MemCmpExpansion::getMemCmpExpansionZeroCase() { - unsigned LoadIndex = 0; - // This loop populates each of the LoadCmpBlocks with the IR sequence to - // handle multiple loads per block. - for (unsigned I = 0; I < getNumBlocks(); ++I) { - emitLoadCompareBlockMultipleLoads(I, LoadIndex); - } - - emitMemCmpResultBlock(); - return PhiRes; -} - -/// A memcmp expansion that compares equality with 0 and only has one block of -/// load and compare can bypass the compare, branch, and phi IR that is required -/// in the general case. -Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() { - unsigned LoadIndex = 0; - Value *Cmp = getCompareLoadPairs(0, LoadIndex); - assert(LoadIndex == getNumLoads() && "some entries were not consumed"); - return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); -} - -/// A memcmp expansion that only has one block of load and compare can bypass -/// the compare, branch, and phi IR that is required in the general case. -Value *MemCmpExpansion::getMemCmpOneBlock() { - assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block"); - - Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8); - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - // Cast source to LoadSizeType*. 
- if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Load LoadSizeType from the base address. - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (DL.isLittleEndian() && Size != 1) { - Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::bswap, LoadSizeType); - LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); - LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); - } - - if (Size < 4) { - // The i8 and i16 cases don't need compares. We zext the loaded values and - // subtract them to get the suitable negative, zero, or positive i32 result. - LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty()); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty()); - return Builder.CreateSub(LoadSrc1, LoadSrc2); - } - - // The result of memcmp is negative, zero, or positive, so produce that by - // subtracting 2 extended compare bits: sub (ugt, ult). - // If a target prefers to use selects to get -1/0/1, they should be able - // to transform this later. The inverse transform (going from selects to math) - // may not be possible in the DAG because the selects got converted into - // branches before we got there. - Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2); - Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2); - Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty()); - Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty()); - return Builder.CreateSub(ZextUGT, ZextULT); -} - -// This function expands the memcmp call into an inline expansion and returns -// the memcmp result. -Value *MemCmpExpansion::getMemCmpExpansion() { - // A memcmp with zero-comparison with only one block of load and compare does - // not need to set up any extra blocks. This case could be handled in the DAG, - // but since we have all of the machinery to flexibly expand any memcpy here, - // we choose to handle this case too to avoid fragmented lowering. - if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) { - BasicBlock *StartBlock = CI->getParent(); - EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); - setupEndBlockPHINodes(); - createResultBlock(); - - // If return value of memcmp is not used in a zero equality, we need to - // calculate which source was larger. The calculation requires the - // two loaded source values of each load compare block. - // These will be saved in the phi nodes created by setupResultBlockPHINodes. - if (!IsUsedForZeroCmp) setupResultBlockPHINodes(); - - // Create the number of required load compare basic blocks. - createLoadCmpBlocks(); - - // Update the terminator added by splitBasicBlock to branch to the first - // LoadCmpBlock. - StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); - } - - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - if (IsUsedForZeroCmp) - return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock() - : getMemCmpExpansionZeroCase(); - - // TODO: Handle more than one load pair per block in getMemCmpOneBlock(). - if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock(); - - for (unsigned I = 0; I < getNumBlocks(); ++I) { - emitLoadCompareBlock(I); - } - - emitMemCmpResultBlock(); - return PhiRes; -} - -// This function checks to see if an expansion of memcmp can be generated. 
-// It checks for constant compare size that is less than the max inline size.
-// If an expansion cannot occur, returns false to leave as a library call.
-// Otherwise, the library call is replaced with a new IR instruction sequence.
-/// We want to transform:
-/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
-/// To:
-/// loadbb:
-/// %0 = bitcast i32* %buffer2 to i8*
-/// %1 = bitcast i32* %buffer1 to i8*
-/// %2 = bitcast i8* %1 to i64*
-/// %3 = bitcast i8* %0 to i64*
-/// %4 = load i64, i64* %2
-/// %5 = load i64, i64* %3
-/// %6 = call i64 @llvm.bswap.i64(i64 %4)
-/// %7 = call i64 @llvm.bswap.i64(i64 %5)
-/// %8 = sub i64 %6, %7
-/// %9 = icmp ne i64 %8, 0
-/// br i1 %9, label %res_block, label %loadbb1
-/// res_block: ; preds = %loadbb2,
-/// %loadbb1, %loadbb
-/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
-/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
-/// %10 = icmp ult i64 %phi.src1, %phi.src2
-/// %11 = select i1 %10, i32 -1, i32 1
-/// br label %endblock
-/// loadbb1: ; preds = %loadbb
-/// %12 = bitcast i32* %buffer2 to i8*
-/// %13 = bitcast i32* %buffer1 to i8*
-/// %14 = bitcast i8* %13 to i32*
-/// %15 = bitcast i8* %12 to i32*
-/// %16 = getelementptr i32, i32* %14, i32 2
-/// %17 = getelementptr i32, i32* %15, i32 2
-/// %18 = load i32, i32* %16
-/// %19 = load i32, i32* %17
-/// %20 = call i32 @llvm.bswap.i32(i32 %18)
-/// %21 = call i32 @llvm.bswap.i32(i32 %19)
-/// %22 = zext i32 %20 to i64
-/// %23 = zext i32 %21 to i64
-/// %24 = sub i64 %22, %23
-/// %25 = icmp ne i64 %24, 0
-/// br i1 %25, label %res_block, label %loadbb2
-/// loadbb2: ; preds = %loadbb1
-/// %26 = bitcast i32* %buffer2 to i8*
-/// %27 = bitcast i32* %buffer1 to i8*
-/// %28 = bitcast i8* %27 to i16*
-/// %29 = bitcast i8* %26 to i16*
-/// %30 = getelementptr i16, i16* %28, i16 6
-/// %31 = getelementptr i16, i16* %29, i16 6
-/// %32 = load i16, i16* %30
-/// %33 = load i16, i16* %31
-/// %34 = call i16 @llvm.bswap.i16(i16 %32)
-/// %35 = call i16 @llvm.bswap.i16(i16 %33)
-/// %36 = zext i16 %34 to i64
-/// %37 = zext i16 %35 to i64
-/// %38 = sub i64 %36, %37
-/// %39 = icmp ne i64 %38, 0
-/// br i1 %39, label %res_block, label %loadbb3
-/// loadbb3: ; preds = %loadbb2
-/// %40 = bitcast i32* %buffer2 to i8*
-/// %41 = bitcast i32* %buffer1 to i8*
-/// %42 = getelementptr i8, i8* %41, i8 14
-/// %43 = getelementptr i8, i8* %40, i8 14
-/// %44 = load i8, i8* %42
-/// %45 = load i8, i8* %43
-/// %46 = zext i8 %44 to i32
-/// %47 = zext i8 %45 to i32
-/// %48 = sub i32 %46, %47
-/// br label %endblock
-/// endblock: ; preds = %res_block,
-/// %loadbb3
-/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
-/// ret i32 %phi.res
-static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
-                         const TargetLowering *TLI, const DataLayout *DL) {
-  NumMemCmpCalls++;
-
-  // Early exit from expansion if -Oz.
-  if (CI->getFunction()->optForMinSize())
-    return false;
-
-  // Early exit from expansion if size is not a constant.
-  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
-  if (!SizeCast) {
-    NumMemCmpNotConstant++;
-    return false;
-  }
-  const uint64_t SizeVal = SizeCast->getZExtValue();
-
-  if (SizeVal == 0) {
-    return false;
-  }
-
-  // TTI call to check if target would like to expand memcmp. Also, get the
-  // available load sizes.
-  const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
-  const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
-  if (!Options) return false;
-
-  const unsigned MaxNumLoads =
-      TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());
-
-  MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
-                            IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL);
-
-  // Don't expand if this will require more loads than desired by the target.
-  if (Expansion.getNumLoads() == 0) {
-    NumMemCmpGreaterThanMax++;
-    return false;
-  }
-
-  NumMemCmpInlined++;
-
-  Value *Res = Expansion.getMemCmpExpansion();
-
-  // Replace call with result of expansion and erase call.
-  CI->replaceAllUsesWith(Res);
-  CI->eraseFromParent();
-
-  return true;
-}
-
 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
   BasicBlock *BB = CI->getParent();
 
@@ -2542,12 +1838,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
     return true;
   }
 
-  LibFunc Func;
-  if (TLInfo->getLibFunc(ImmutableCallSite(CI), Func) &&
-      Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TLI, DL)) {
-    ModifiedDT = true;
-    return true;
-  }
   return false;
 }
 
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index c5101b1ecfc..59e88ba3bda 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -600,8 +600,14 @@ void TargetPassConfig::addIRPasses() {
       addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
   }
 
-  if (getOptLevel() != CodeGenOpt::None && EnableMergeICmps) {
-    addPass(createMergeICmpsPass());
+  if (getOptLevel() != CodeGenOpt::None) {
+    // The MergeICmpsPass tries to create memcmp calls by grouping sequences of
+    // loads and compares. ExpandMemCmpPass then tries to expand those calls
+    // into optimally-sized loads and compares. The transforms are enabled by a
+    // target lowering hook.
+    if (EnableMergeICmps)
+      addPass(createMergeICmpsPass());
+    addPass(createExpandMemCmpPass());
   }
 
   // Run GC lowering passes for builtin collectors
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index d79ae851005..164163d2131 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -9,6 +9,7 @@ add_llvm_library(LLVMScalarOpts
   DeadStoreElimination.cpp
   DivRemPairs.cpp
   EarlyCSE.cpp
+  ExpandMemCmp.cpp
   FlattenCFGPass.cpp
   Float2Int.cpp
   GuardWidening.cpp
diff --git a/lib/Transforms/Scalar/ExpandMemCmp.cpp b/lib/Transforms/Scalar/ExpandMemCmp.cpp
new file mode 100644
index 00000000000..0cd8c11422f
--- /dev/null
+++ b/lib/Transforms/Scalar/ExpandMemCmp.cpp
@@ -0,0 +1,828 @@
+//===--- ExpandMemCmp.cpp - Expand memcmp() to load/stores ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to expand memcmp() calls into optimally-sized loads and
+// compares for the target, so that the comparison can be done without a
+// library call.
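+//
+// For example, on a little-endian target that can do 8-byte loads, a
+// memcmp(a, b, 16) call can become two pairs of i64 loads that are
+// byte-swapped (memcmp compares as if big-endian) and compared, with an
+// early exit to the result computation on the first mismatching pair.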
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "expandmemcmp"
+
+STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
+STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
+STATISTIC(NumMemCmpGreaterThanMax,
+          "Number of memcmp calls with size greater than max size");
+STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
+
+static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
+    "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
+    cl::desc("The number of loads per basic block for inline expansion of "
+             "memcmp that is only being compared against zero."));
+
+namespace {
+
+// This class provides helper functions to expand a memcmp library call into an
+// inline expansion.
+class MemCmpExpansion {
+  struct ResultBlock {
+    BasicBlock *BB = nullptr;
+    PHINode *PhiSrc1 = nullptr;
+    PHINode *PhiSrc2 = nullptr;
+
+    ResultBlock() = default;
+  };
+
+  CallInst *const CI;
+  ResultBlock ResBlock;
+  const uint64_t Size;
+  unsigned MaxLoadSize;
+  uint64_t NumLoadsNonOneByte;
+  const uint64_t NumLoadsPerBlock;
+  std::vector<BasicBlock *> LoadCmpBlocks;
+  BasicBlock *EndBlock;
+  PHINode *PhiRes;
+  const bool IsUsedForZeroCmp;
+  const DataLayout &DL;
+  IRBuilder<> Builder;
+  // Represents the decomposition in blocks of the expansion. For example,
+  // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and a
+  // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}].
+  // TODO(courbet): Involve the target more in this computation. On X86, 7
+  // bytes can be done more efficiently with two overlapping 4-byte loads than
+  // covering the interval with [{4, 0}, {2, 4}, {1, 6}].
+  struct LoadEntry {
+    LoadEntry(unsigned LoadSize, uint64_t Offset)
+        : LoadSize(LoadSize), Offset(Offset) {
+      assert(Offset % LoadSize == 0 && "invalid load entry");
+    }
+
+    uint64_t getGEPIndex() const { return Offset / LoadSize; }
+
+    // The size of the load for this block, in bytes.
+    const unsigned LoadSize;
+    // The offset of this load WRT the base pointer, in bytes.
+    const uint64_t Offset;
+  };
+  SmallVector<LoadEntry, 8> LoadSequence;
+
+  void createLoadCmpBlocks();
+  void createResultBlock();
+  void setupResultBlockPHINodes();
+  void setupEndBlockPHINodes();
+  Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex);
+  void emitLoadCompareBlock(unsigned BlockIndex);
+  void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
+                                         unsigned &LoadIndex);
+  void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex);
+  void emitMemCmpResultBlock();
+  Value *getMemCmpExpansionZeroCase();
+  Value *getMemCmpEqZeroOneBlock();
+  Value *getMemCmpOneBlock();
+
+ public:
+  MemCmpExpansion(CallInst *CI, uint64_t Size,
+                  const TargetTransformInfo::MemCmpExpansionOptions &Options,
+                  unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
+                  unsigned NumLoadsPerBlock, const DataLayout &DL);
+
+  unsigned getNumBlocks();
+  uint64_t getNumLoads() const { return LoadSequence.size(); }
+
+  Value *getMemCmpExpansion();
+};
+
+// Initialize the basic block structure required for expansion of memcmp call
+// with given maximum load size and memcmp size parameter.
+// This structure includes:
+// 1. A list of load compare blocks - LoadCmpBlocks.
+// 2. An EndBlock, split from original instruction point, which is the block to
+// return from.
+// 3. ResultBlock, block to branch to for early exit when a
+// LoadCmpBlock finds a difference.
+MemCmpExpansion::MemCmpExpansion(
+    CallInst *const CI, uint64_t Size,
+    const TargetTransformInfo::MemCmpExpansionOptions &Options,
+    const unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
+    const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout)
+    : CI(CI),
+      Size(Size),
+      MaxLoadSize(0),
+      NumLoadsNonOneByte(0),
+      NumLoadsPerBlock(NumLoadsPerBlock),
+      IsUsedForZeroCmp(IsUsedForZeroCmp),
+      DL(TheDataLayout),
+      Builder(CI) {
+  assert(Size > 0 && "zero blocks");
+  // Scale the max size down if the target can load more bytes than we need.
+  size_t LoadSizeIndex = 0;
+  while (LoadSizeIndex < Options.LoadSizes.size() &&
+         Options.LoadSizes[LoadSizeIndex] > Size) {
+    ++LoadSizeIndex;
+  }
+  this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex];
+  // Compute the decomposition.
+  uint64_t CurSize = Size;
+  uint64_t Offset = 0;
+  while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) {
+    const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex];
+    assert(LoadSize > 0 && "zero load size");
+    const uint64_t NumLoadsForThisSize = CurSize / LoadSize;
+    if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
+      // Do not expand if the total number of loads is larger than what the
+      // target allows. Note that it's important that we exit before completing
+      // the expansion to avoid using a ton of memory to store the expansion for
+      // large sizes.
+      LoadSequence.clear();
+      return;
+    }
+    if (NumLoadsForThisSize > 0) {
+      for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
+        LoadSequence.push_back({LoadSize, Offset});
+        Offset += LoadSize;
+      }
+      if (LoadSize > 1) {
+        ++NumLoadsNonOneByte;
+      }
+      CurSize = CurSize % LoadSize;
+    }
+    ++LoadSizeIndex;
+  }
+  assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
+}
+
+unsigned MemCmpExpansion::getNumBlocks() {
+  if (IsUsedForZeroCmp)
+    return getNumLoads() / NumLoadsPerBlock +
+           (getNumLoads() % NumLoadsPerBlock != 0 ?
1 : 0);
+  return getNumLoads();
+}
+
+void MemCmpExpansion::createLoadCmpBlocks() {
+  for (unsigned i = 0; i < getNumBlocks(); i++) {
+    BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
+                                        EndBlock->getParent(), EndBlock);
+    LoadCmpBlocks.push_back(BB);
+  }
+}
+
+void MemCmpExpansion::createResultBlock() {
+  ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
+                                   EndBlock->getParent(), EndBlock);
+}
+
+// This function creates the IR instructions for loading and comparing 1 byte.
+// It loads 1 byte from each source of the memcmp parameters with the given
+// GEPIndex. It then subtracts the two loaded values and adds this result to the
+// final phi node for selecting the memcmp result.
+void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
+                                               unsigned GEPIndex) {
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+  Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
+  // Cast source to LoadSizeType*.
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Get the base address using the GEPIndex.
+  if (GEPIndex != 0) {
+    Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+    Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+  }
+
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
+  LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
+  Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+
+  PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]);
+
+  if (BlockIndex < (LoadCmpBlocks.size() - 1)) {
+    // Early exit branch if difference found to EndBlock. Otherwise, continue to
+    // the next LoadCmpBlock.
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+                                    ConstantInt::get(Diff->getType(), 0));
+    BranchInst *CmpBr =
+        BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp);
+    Builder.Insert(CmpBr);
+  } else {
+    // The last block has an unconditional branch to EndBlock.
+    BranchInst *CmpBr = BranchInst::Create(EndBlock);
+    Builder.Insert(CmpBr);
+  }
+}
+
+/// Generate an equality comparison for one or more pairs of loaded values.
+/// This is used in the case where the memcmp() call is compared equal or not
+/// equal to zero.
+Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
+                                            unsigned &LoadIndex) {
+  assert(LoadIndex < getNumLoads() &&
+         "getCompareLoadPairs() called with no remaining loads");
+  std::vector<Value *> XorList, OrList;
+  Value *Diff;
+
+  const unsigned NumLoads =
+      std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock);
+
+  // For a single-block expansion, start inserting before the memcmp call.
+  if (LoadCmpBlocks.empty())
+    Builder.SetInsertPoint(CI);
+  else
+    Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+
+  Value *Cmp = nullptr;
+  // If we have multiple loads per block, we need to generate a composite
+  // comparison using xor+or. The type for the combinations is the largest load
+  // type.
+  IntegerType *const MaxLoadType =
+      NumLoads == 1 ?
nullptr
+                    : IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
+    const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
+
+    IntegerType *LoadSizeType =
+        IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
+
+    Value *Source1 = CI->getArgOperand(0);
+    Value *Source2 = CI->getArgOperand(1);
+
+    // Cast source to LoadSizeType*.
+    if (Source1->getType() != LoadSizeType)
+      Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+    if (Source2->getType() != LoadSizeType)
+      Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+    // Get the base address using a GEP.
+    if (CurLoadEntry.Offset != 0) {
+      Source1 = Builder.CreateGEP(
+          LoadSizeType, Source1,
+          ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+      Source2 = Builder.CreateGEP(
+          LoadSizeType, Source2,
+          ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+    }
+
+    // Get a constant or load a value for each source address.
+    Value *LoadSrc1 = nullptr;
+    if (auto *Source1C = dyn_cast<Constant>(Source1))
+      LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
+    if (!LoadSrc1)
+      LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+
+    Value *LoadSrc2 = nullptr;
+    if (auto *Source2C = dyn_cast<Constant>(Source2))
+      LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
+    if (!LoadSrc2)
+      LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+    if (NumLoads != 1) {
+      if (LoadSizeType != MaxLoadType) {
+        LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
+        LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
+      }
+      // If we have multiple loads per block, we need to generate a composite
+      // comparison using xor+or.
+      Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
+      Diff = Builder.CreateZExt(Diff, MaxLoadType);
+      XorList.push_back(Diff);
+    } else {
+      // If there's only one load per block, we just compare the loaded values.
+      Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
+    }
+  }
+
+  auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
+    std::vector<Value *> OutList;
+    for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
+      Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
+      OutList.push_back(Or);
+    }
+    if (InList.size() % 2 != 0)
+      OutList.push_back(InList.back());
+    return OutList;
+  };
+
+  if (!Cmp) {
+    // Pairwise OR the XOR results.
+    OrList = pairWiseOr(XorList);
+
+    // Pairwise OR the OR results until one result left.
+    while (OrList.size() != 1) {
+      OrList = pairWiseOr(OrList);
+    }
+    Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
+  }
+
+  return Cmp;
+}
+
+void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
+                                                        unsigned &LoadIndex) {
+  Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);
+
+  BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
+                           ? EndBlock
+                           : LoadCmpBlocks[BlockIndex + 1];
+  // Early exit branch if difference found to ResultBlock. Otherwise,
+  // continue to next LoadCmpBlock or EndBlock.
+  BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+  Builder.Insert(CmpBr);
+
+  // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0
+  // since early exit to ResultBlock was not taken (no difference was found in
+  // any of the bytes).
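+  // A zero incoming value is exactly the memcmp result for equal buffers.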
+  if (BlockIndex == LoadCmpBlocks.size() - 1) {
+    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+    PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
+  }
+}
+
+// This function creates the IR instructions for loading and comparing using the
+// given LoadSize. It loads the number of bytes specified by LoadSize from each
+// source of the memcmp parameters. It then does a subtract to see if there was
+// a difference in the loaded values. If a difference is found, it branches
+// with an early exit to the ResultBlock for calculating which source was
+// larger. Otherwise, it falls through to either the next LoadCmpBlock or
+// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with
+// a special case through emitLoadCompareByteBlock, which simply subtracts the
+// loaded values and adds the result to the final phi node.
+void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
+  // There is one load per block in this case, BlockIndex == LoadIndex.
+  const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];
+
+  if (CurLoadEntry.LoadSize == 1) {
+    MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex,
+                                              CurLoadEntry.getGEPIndex());
+    return;
+  }
+
+  Type *LoadSizeType =
+      IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
+  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
+
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+  // Cast source to LoadSizeType*.
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Get the base address using a GEP.
+  if (CurLoadEntry.Offset != 0) {
+    Source1 = Builder.CreateGEP(
+        LoadSizeType, Source1,
+        ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+    Source2 = Builder.CreateGEP(
+        LoadSizeType, Source2,
+        ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+  }
+
+  // Load LoadSizeType from the base address.
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  if (DL.isLittleEndian()) {
+    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+                                                Intrinsic::bswap, LoadSizeType);
+    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+  }
+
+  if (LoadSizeType != MaxLoadType) {
+    LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
+    LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
+  }
+
+  // Add the loaded values to the phi nodes for calculating memcmp result only
+  // if result is not used in a zero equality.
+  if (!IsUsedForZeroCmp) {
+    ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]);
+    ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]);
+  }
+
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2);
+  BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
+                           ? EndBlock
+                           : LoadCmpBlocks[BlockIndex + 1];
+  // Early exit branch if difference found to ResultBlock. Otherwise, continue
+  // to next LoadCmpBlock or EndBlock.
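+  // Note that Cmp is true when the loaded values are equal, so the 'true'
+  // successor falls through to NextBB and the 'false' successor takes the
+  // early exit to ResBlock.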
+  BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);
+  Builder.Insert(CmpBr);
+
+  // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0
+  // since early exit to ResultBlock was not taken (no difference was found in
+  // any of the bytes).
+  if (BlockIndex == LoadCmpBlocks.size() - 1) {
+    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+    PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
+  }
+}
+
+// This function populates the ResultBlock with a sequence to calculate the
+// memcmp result. It compares the two loaded source values and returns -1 if
+// src1 < src2 and 1 if src1 > src2.
+void MemCmpExpansion::emitMemCmpResultBlock() {
+  // Special case: if memcmp result is used in a zero equality, result does not
+  // need to be calculated and can simply return 1.
+  if (IsUsedForZeroCmp) {
+    BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+    Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+    Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
+    PhiRes->addIncoming(Res, ResBlock.BB);
+    BranchInst *NewBr = BranchInst::Create(EndBlock);
+    Builder.Insert(NewBr);
+    return;
+  }
+  BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+  Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
+                                  ResBlock.PhiSrc2);
+
+  Value *Res =
+      Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
+                           ConstantInt::get(Builder.getInt32Ty(), 1));
+
+  BranchInst *NewBr = BranchInst::Create(EndBlock);
+  Builder.Insert(NewBr);
+  PhiRes->addIncoming(Res, ResBlock.BB);
+}
+
+void MemCmpExpansion::setupResultBlockPHINodes() {
+  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  Builder.SetInsertPoint(ResBlock.BB);
+  // Note: this assumes one load per block.
+  ResBlock.PhiSrc1 =
+      Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1");
+  ResBlock.PhiSrc2 =
+      Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2");
+}
+
+void MemCmpExpansion::setupEndBlockPHINodes() {
+  Builder.SetInsertPoint(&EndBlock->front());
+  PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
+}
+
+Value *MemCmpExpansion::getMemCmpExpansionZeroCase() {
+  unsigned LoadIndex = 0;
+  // This loop populates each of the LoadCmpBlocks with the IR sequence to
+  // handle multiple loads per block.
+  for (unsigned I = 0; I < getNumBlocks(); ++I) {
+    emitLoadCompareBlockMultipleLoads(I, LoadIndex);
+  }
+
+  emitMemCmpResultBlock();
+  return PhiRes;
+}
+
+/// A memcmp expansion that compares equality with 0 and only has one block of
+/// load and compare can bypass the compare, branch, and phi IR that is required
+/// in the general case.
+Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
+  unsigned LoadIndex = 0;
+  Value *Cmp = getCompareLoadPairs(0, LoadIndex);
+  assert(LoadIndex == getNumLoads() && "some entries were not consumed");
+  return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
+}
+
+/// A memcmp expansion that only has one block of load and compare can bypass
+/// the compare, branch, and phi IR that is required in the general case.
+Value *MemCmpExpansion::getMemCmpOneBlock() {
+  assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block");
+
+  Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  // Cast source to LoadSizeType*.
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Load LoadSizeType from the base address.
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  if (DL.isLittleEndian() && Size != 1) {
+    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+                                                Intrinsic::bswap, LoadSizeType);
+    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+  }
+
+  if (Size < 4) {
+    // The i8 and i16 cases don't need compares. We zext the loaded values and
+    // subtract them to get the suitable negative, zero, or positive i32 result.
+    LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty());
+    LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty());
+    return Builder.CreateSub(LoadSrc1, LoadSrc2);
+  }
+
+  // The result of memcmp is negative, zero, or positive, so produce that by
+  // subtracting 2 extended compare bits: sub (ugt, ult).
+  // If a target prefers to use selects to get -1/0/1, they should be able
+  // to transform this later. The inverse transform (going from selects to math)
+  // may not be possible in the DAG because the selects got converted into
+  // branches before we got there.
+  Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
+  Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
+  Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
+  Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
+  return Builder.CreateSub(ZextUGT, ZextULT);
+}
+
+// This function expands the memcmp call into an inline expansion and returns
+// the memcmp result.
+Value *MemCmpExpansion::getMemCmpExpansion() {
+  // A memcmp with a zero-comparison and only one block of load and compare does
+  // not need to set up any extra blocks. This case could be handled in the DAG,
+  // but since we have all of the machinery to flexibly expand any memcmp here,
+  // we choose to handle this case too to avoid fragmented lowering.
+  if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) {
+    BasicBlock *StartBlock = CI->getParent();
+    EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+    setupEndBlockPHINodes();
+    createResultBlock();
+
+    // If return value of memcmp is not used in a zero equality, we need to
+    // calculate which source was larger. The calculation requires the
+    // two loaded source values of each load compare block.
+    // These will be saved in the phi nodes created by setupResultBlockPHINodes.
+    if (!IsUsedForZeroCmp) setupResultBlockPHINodes();
+
+    // Create the number of required load compare basic blocks.
+    createLoadCmpBlocks();
+
+    // Update the terminator added by splitBasicBlock to branch to the first
+    // LoadCmpBlock.
+    StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
+  }
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  if (IsUsedForZeroCmp)
+    return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock()
+                               : getMemCmpExpansionZeroCase();
+
+  // TODO: Handle more than one load pair per block in getMemCmpOneBlock().
+  if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock();
+
+  for (unsigned I = 0; I < getNumBlocks(); ++I) {
+    emitLoadCompareBlock(I);
+  }
+
+  emitMemCmpResultBlock();
+  return PhiRes;
+}
+
+// This function checks to see if an expansion of memcmp can be generated.
+// It checks for constant compare size that is less than the max inline size.
+// If an expansion cannot occur, returns false to leave as a library call.
+// Otherwise, the library call is replaced with a new IR instruction sequence.
+/// We want to transform:
+/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
+/// To:
+/// loadbb:
+/// %0 = bitcast i32* %buffer2 to i8*
+/// %1 = bitcast i32* %buffer1 to i8*
+/// %2 = bitcast i8* %1 to i64*
+/// %3 = bitcast i8* %0 to i64*
+/// %4 = load i64, i64* %2
+/// %5 = load i64, i64* %3
+/// %6 = call i64 @llvm.bswap.i64(i64 %4)
+/// %7 = call i64 @llvm.bswap.i64(i64 %5)
+/// %8 = sub i64 %6, %7
+/// %9 = icmp ne i64 %8, 0
+/// br i1 %9, label %res_block, label %loadbb1
+/// res_block: ; preds = %loadbb2,
+/// %loadbb1, %loadbb
+/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
+/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
+/// %10 = icmp ult i64 %phi.src1, %phi.src2
+/// %11 = select i1 %10, i32 -1, i32 1
+/// br label %endblock
+/// loadbb1: ; preds = %loadbb
+/// %12 = bitcast i32* %buffer2 to i8*
+/// %13 = bitcast i32* %buffer1 to i8*
+/// %14 = bitcast i8* %13 to i32*
+/// %15 = bitcast i8* %12 to i32*
+/// %16 = getelementptr i32, i32* %14, i32 2
+/// %17 = getelementptr i32, i32* %15, i32 2
+/// %18 = load i32, i32* %16
+/// %19 = load i32, i32* %17
+/// %20 = call i32 @llvm.bswap.i32(i32 %18)
+/// %21 = call i32 @llvm.bswap.i32(i32 %19)
+/// %22 = zext i32 %20 to i64
+/// %23 = zext i32 %21 to i64
+/// %24 = sub i64 %22, %23
+/// %25 = icmp ne i64 %24, 0
+/// br i1 %25, label %res_block, label %loadbb2
+/// loadbb2: ; preds = %loadbb1
+/// %26 = bitcast i32* %buffer2 to i8*
+/// %27 = bitcast i32* %buffer1 to i8*
+/// %28 = bitcast i8* %27 to i16*
+/// %29 = bitcast i8* %26 to i16*
+/// %30 = getelementptr i16, i16* %28, i16 6
+/// %31 = getelementptr i16, i16* %29, i16 6
+/// %32 = load i16, i16* %30
+/// %33 = load i16, i16* %31
+/// %34 = call i16 @llvm.bswap.i16(i16 %32)
+/// %35 = call i16 @llvm.bswap.i16(i16 %33)
+/// %36 = zext i16 %34 to i64
+/// %37 = zext i16 %35 to i64
+/// %38 = sub i64 %36, %37
+/// %39 = icmp ne i64 %38, 0
+/// br i1 %39, label %res_block, label %loadbb3
+/// loadbb3: ; preds = %loadbb2
+/// %40 = bitcast i32* %buffer2 to i8*
+/// %41 = bitcast i32* %buffer1 to i8*
+/// %42 = getelementptr i8, i8* %41, i8 14
+/// %43 = getelementptr i8, i8* %40, i8 14
+/// %44 = load i8, i8* %42
+/// %45 = load i8, i8* %43
+/// %46 = zext i8 %44 to i32
+/// %47 = zext i8 %45 to i32
+/// %48 = sub i32 %46, %47
+/// br label %endblock
+/// endblock: ; preds = %res_block,
+/// %loadbb3
+/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
+/// ret i32 %phi.res
+static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
+                         const TargetLowering *TLI, const DataLayout *DL) {
+  NumMemCmpCalls++;
+
+  // Early exit from expansion if -Oz.
+  if (CI->getFunction()->optForMinSize())
+    return false;
+
+  // Early exit from expansion if size is not a constant.
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  if (!SizeCast) {
+    NumMemCmpNotConstant++;
+    return false;
+  }
+  const uint64_t SizeVal = SizeCast->getZExtValue();
+
+  if (SizeVal == 0) {
+    return false;
+  }
+
+  // TTI call to check if target would like to expand memcmp. Also, get the
+  // available load sizes.
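+  // A memcmp whose result is only compared (not) equal to zero does not need
+  // the ordered -1/0/1 result, which enables the cheaper equality-only
+  // expansions above.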
+  const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
+  const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
+  if (!Options) return false;
+
+  const unsigned MaxNumLoads =
+      TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());
+
+  MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
+                            IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL);
+
+  // Don't expand if this will require more loads than desired by the target.
+  if (Expansion.getNumLoads() == 0) {
+    NumMemCmpGreaterThanMax++;
+    return false;
+  }
+
+  NumMemCmpInlined++;
+
+  Value *Res = Expansion.getMemCmpExpansion();
+
+  // Replace call with result of expansion and erase call.
+  CI->replaceAllUsesWith(Res);
+  CI->eraseFromParent();
+
+  return true;
+}
+
+class ExpandMemCmpPass : public FunctionPass {
+public:
+  static char ID;
+
+  ExpandMemCmpPass() : FunctionPass(ID) {
+    initializeExpandMemCmpPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F)) return false;
+
+    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+    if (!TPC) {
+      return false;
+    }
+    const TargetLowering* TL =
+        TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering();
+
+    const TargetLibraryInfo *TLI =
+        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    const TargetTransformInfo *TTI =
+        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    auto PA = runImpl(F, TLI, TTI, TL);
+    return !PA.areAllPreserved();
+  }
+
+private:
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
+                            const TargetTransformInfo *TTI,
+                            const TargetLowering* TL);
+  // Returns true if a change was made.
+  bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
+                  const TargetTransformInfo *TTI, const TargetLowering* TL,
+                  const DataLayout& DL);
+};
+
+bool ExpandMemCmpPass::runOnBlock(
+    BasicBlock &BB, const TargetLibraryInfo *TLI,
+    const TargetTransformInfo *TTI, const TargetLowering* TL,
+    const DataLayout& DL) {
+  for (Instruction& I : BB) {
+    CallInst *CI = dyn_cast<CallInst>(&I);
+    if (!CI) {
+      continue;
+    }
+    LibFunc Func;
+    if (TLI->getLibFunc(ImmutableCallSite(CI), Func) &&
+        Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TL, &DL)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+PreservedAnalyses ExpandMemCmpPass::runImpl(
+    Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI,
+    const TargetLowering* TL) {
+  const DataLayout& DL = F.getParent()->getDataLayout();
+  bool MadeChanges = false;
+  for (auto BBIt = F.begin(); BBIt != F.end();) {
+    if (runOnBlock(*BBIt, TLI, TTI, TL, DL)) {
+      MadeChanges = true;
+      // If changes were made, restart the function from the beginning, since
+      // the structure of the function was changed.
+      BBIt = F.begin();
+    } else {
+      ++BBIt;
+    }
+  }
+  return MadeChanges ?
PreservedAnalyses::none() : PreservedAnalyses::all(); +} + +} // namespace + +char ExpandMemCmpPass::ID = 0; +INITIALIZE_PASS_BEGIN(ExpandMemCmpPass, "expandmemcmp", + "Expand memcmp() to load/stores", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp", + "Expand memcmp() to load/stores", false, false) + +Pass *llvm::createExpandMemCmpPass() { + return new ExpandMemCmpPass(); +} diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index c1034ace206..4b694cecea6 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -48,6 +48,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeNewGVNLegacyPassPass(Registry); initializeEarlyCSELegacyPassPass(Registry); initializeEarlyCSEMemSSALegacyPassPass(Registry); + initializeExpandMemCmpPassPass(Registry); initializeGVNHoistLegacyPassPass(Registry); initializeGVNSinkLegacyPassPass(Registry); initializeFlattenCFGPassPass(Registry); diff --git a/test/CodeGen/Generic/llc-start-stop.ll b/test/CodeGen/Generic/llc-start-stop.ll index 85b69c37aa0..9056e2cab49 100644 --- a/test/CodeGen/Generic/llc-start-stop.ll +++ b/test/CodeGen/Generic/llc-start-stop.ll @@ -13,15 +13,15 @@ ; STOP-BEFORE-NOT: Loop Strength Reduction ; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-AFTER -; START-AFTER: -machine-branch-prob -gc-lowering +; START-AFTER: -machine-branch-prob -expandmemcmp ; START-AFTER: FunctionPass Manager -; START-AFTER-NEXT: Lower Garbage Collection Instructions +; START-AFTER-NEXT: Expand memcmp() to load/stores ; RUN: llc < %s -debug-pass=Structure -start-before=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-BEFORE ; START-BEFORE: -machine-branch-prob -domtree ; START-BEFORE: FunctionPass Manager ; START-BEFORE: Loop Strength Reduction -; START-BEFORE-NEXT: Lower Garbage Collection Instructions +; START-BEFORE-NEXT: Expand memcmp() to load/stores ; RUN: not llc < %s -start-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-START-BEFORE ; RUN: not llc < %s -stop-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-STOP-BEFORE diff --git a/test/CodeGen/X86/memcmp-optsize.ll b/test/CodeGen/X86/memcmp-optsize.ll index 77d9fa69182..3f5eeba7055 100644 --- a/test/CodeGen/X86/memcmp-optsize.ll +++ b/test/CodeGen/X86/memcmp-optsize.ll @@ -156,36 +156,36 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize { define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length3_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: cmpw (%ecx), %dx -; X86-NEXT: jne .LBB5_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 2(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: cmpw (%eax), %dx +; X86-NEXT: jne .LBB5_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 2(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 2(%eax), %dl ; X86-NEXT: je .LBB5_3 -; X86-NEXT: .LBB5_1: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: incl %eax +; X86-NEXT: .LBB5_2: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: .LBB5_3: # %endblock -; X86-NEXT: testl %eax, %eax 
+; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: cmpw (%rsi), %ax -; X64-NEXT: jne .LBB5_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB5_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 2(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 2(%rsi), %cl ; X64-NEXT: je .LBB5_3 -; X64-NEXT: .LBB5_1: # %res_block +; X64-NEXT: .LBB5_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB5_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -314,36 +314,36 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize { define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length5_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne .LBB10_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 4(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB10_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 4(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 4(%eax), %dl ; X86-NEXT: je .LBB10_3 -; X86-NEXT: .LBB10_1: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: incl %eax +; X86-NEXT: .LBB10_2: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: .LBB10_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: cmpl (%rsi), %eax -; X64-NEXT: jne .LBB10_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB10_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 4(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 4(%rsi), %cl ; X64-NEXT: je .LBB10_3 -; X64-NEXT: .LBB10_1: # %res_block +; X64-NEXT: .LBB10_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB10_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -356,7 +356,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -365,8 +365,8 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB11_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB11_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: movl 4(%esi), %ecx ; X86-NEXT: movl 4(%eax), %edx ; X86-NEXT: bswapl %ecx @@ -374,7 +374,7 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: je .LBB11_3 -; X86-NEXT: .LBB11_1: # %res_block +; X86-NEXT: .LBB11_2: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al @@ -400,22 +400,22 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne .LBB12_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movl 
4(%eax), %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB12_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 4(%eax), %edx ; X86-NEXT: je .LBB12_3 -; X86-NEXT: .LBB12_1: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: incl %eax +; X86-NEXT: .LBB12_2: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: .LBB12_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -432,15 +432,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { define i1 @length8_eq_const(i8* %X) nounwind optsize { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 -; X86-NEXT: jne .LBB13_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB13_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 ; X86-NEXT: je .LBB13_3 -; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: .LBB13_2: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: incl %eax ; X86-NEXT: .LBB13_3: # %endblock @@ -473,16 +473,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB14_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB14_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl 8(%rsi), %ecx ; X64-NEXT: je .LBB14_3 -; X64-NEXT: .LBB14_1: # %res_block +; X64-NEXT: .LBB14_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB14_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -505,28 +505,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB15_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB15_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB15_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB15_1: # %res_block +; X64-NEXT: je .LBB15_3 +; X64-NEXT: .LBB15_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB15_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -546,28 +545,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB16_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB16_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne 
.LBB16_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: je .LBB16_3 +; X64-NEXT: .LBB16_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m @@ -701,19 +699,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB20_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB20_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movq 16(%rdi), %rcx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx ; X64-SSE2-NEXT: je .LBB20_3 -; X64-SSE2-NEXT: .LBB20_1: # %res_block +; X64-SSE2-NEXT: .LBB20_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB20_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -721,18 +719,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length24_eq: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: jne .LBB20_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB20_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: movq 16(%rdi), %rcx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx ; X64-AVX2-NEXT: je .LBB20_3 -; X64-AVX2-NEXT: .LBB20_1: # %res_block +; X64-AVX2-NEXT: .LBB20_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB20_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -757,18 +755,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB21_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB21_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) ; X64-SSE2-NEXT: je .LBB21_3 -; X64-SSE2-NEXT: .LBB21_1: # %res_block +; X64-SSE2-NEXT: .LBB21_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB21_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -776,18 +774,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length24_eq_const: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: jne .LBB21_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB21_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-AVX2-NEXT: 
cmpq %rcx, 16(%rdi) ; X64-AVX2-NEXT: je .LBB21_3 -; X64-AVX2-NEXT: .LBB21_1: # %res_block +; X64-AVX2-NEXT: .LBB21_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB21_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -833,7 +831,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-NOSSE-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 @@ -841,8 +839,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X86-SSE2-NEXT: pmovmskb %xmm1, %edx ; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB23_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB23_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -850,7 +848,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB23_3 -; X86-SSE2-NEXT: .LBB23_1: # %res_block +; X86-SSE2-NEXT: .LBB23_2: # %res_block ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: incl %eax ; X86-SSE2-NEXT: .LBB23_3: # %endblock @@ -859,14 +857,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB23_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB23_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -874,7 +872,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB23_3 -; X64-SSE2-NEXT: .LBB23_1: # %res_block +; X64-SSE2-NEXT: .LBB23_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB23_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -909,21 +907,21 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize { ; X86-NOSSE-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movdqu (%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB24_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB24_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB24_3 -; X86-SSE2-NEXT: .LBB24_1: # %res_block +; X86-SSE2-NEXT: .LBB24_2: # %res_block ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: incl %eax ; X86-SSE2-NEXT: .LBB24_3: # %endblock @@ -932,20 +930,20 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; 
X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB24_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB24_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB24_3 -; X64-SSE2-NEXT: .LBB24_1: # %res_block +; X64-SSE2-NEXT: .LBB24_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB24_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -1009,20 +1007,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB26_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB26_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB26_3 -; X64-AVX2-NEXT: .LBB26_1: # %res_block +; X64-AVX2-NEXT: .LBB26_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB26_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -1059,20 +1057,20 @@ define i1 @length64_eq_const(i8* %X) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq_const: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB27_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB27_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB27_3 -; X64-AVX2-NEXT: .LBB27_1: # %res_block +; X64-AVX2-NEXT: .LBB27_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB27_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll index 393e4c42d8b..84fd45b0a08 100644 --- a/test/CodeGen/X86/memcmp.ll +++ b/test/CodeGen/X86/memcmp.ll @@ -187,35 +187,35 @@ define i32 @length3(i8* %X, i8* %Y) nounwind { define i1 @length3_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length3_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: cmpw (%ecx), %dx -; X86-NEXT: jne .LBB7_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 2(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: cmpw (%eax), %dx +; X86-NEXT: jne .LBB7_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 2(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 2(%eax), %dl ; X86-NEXT: je .LBB7_3 -; X86-NEXT: .LBB7_1: # %res_block -; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB7_2: # %res_block +; 
X86-NEXT: movl $1, %ecx ; X86-NEXT: .LBB7_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: cmpw (%rsi), %ax -; X64-NEXT: jne .LBB7_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB7_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 2(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 2(%rsi), %cl ; X64-NEXT: je .LBB7_3 -; X64-NEXT: .LBB7_1: # %res_block +; X64-NEXT: .LBB7_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB7_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -344,35 +344,35 @@ define i32 @length5(i8* %X, i8* %Y) nounwind { define i1 @length5_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length5_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne .LBB12_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 4(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB12_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 4(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 4(%eax), %dl ; X86-NEXT: je .LBB12_3 -; X86-NEXT: .LBB12_1: # %res_block -; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB12_2: # %res_block +; X86-NEXT: movl $1, %ecx ; X86-NEXT: .LBB12_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: cmpl (%rsi), %eax -; X64-NEXT: jne .LBB12_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB12_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 4(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 4(%rsi), %cl ; X64-NEXT: je .LBB12_3 -; X64-NEXT: .LBB12_1: # %res_block +; X64-NEXT: .LBB12_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB12_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -385,7 +385,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind { define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -394,23 +394,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB13_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB13_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: movl 4(%esi), %ecx ; X86-NEXT: movl 4(%eax), %edx ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB13_1 -; X86-NEXT: # BB#3: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: je .LBB13_3 +; X86-NEXT: .LBB13_2: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al ; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB13_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -431,21 +429,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { define i1 @length8_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne .LBB14_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB14_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 4(%eax), %edx ; X86-NEXT: je .LBB14_3 -; X86-NEXT: .LBB14_1: # %res_block -; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB14_2: # %res_block +; X86-NEXT: movl $1, %ecx ; X86-NEXT: .LBB14_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -462,15 +460,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind { define i1 @length8_eq_const(i8* %X) nounwind { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 -; X86-NEXT: jne .LBB15_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB15_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 ; X86-NEXT: je .LBB15_3 -; X86-NEXT: .LBB15_1: # %res_block +; X86-NEXT: .LBB15_2: # %res_block ; X86-NEXT: movl $1, %eax ; X86-NEXT: .LBB15_3: # %endblock ; X86-NEXT: testl %eax, %eax @@ -502,16 +500,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB16_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB16_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl 8(%rsi), %ecx ; X64-NEXT: je .LBB16_3 -; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: .LBB16_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -534,28 +532,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB17_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB17_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB17_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB17_1: # %res_block +; X64-NEXT: je .LBB17_3 +; X64-NEXT: .LBB17_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB17_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -575,28 +572,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB18_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB18_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx 
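; The relabeled fallthrough in these length8/length12/length16 checks computes
; memcmp's -1/0/+1 without an extra branch: %eax is zeroed and the je skips
; straight to endblock on equality, while res_block materializes the sign via
; setae (%al = a >= b) followed by leal -1(%rax,%rax), i.e. 2*(a >= b) - 1.
; In IR terms that is just a select; a minimal standalone sketch (function
; name hypothetical, not part of the patch's tests):
define i32 @sign_sketch(i64 %a, i64 %b) {
  %lt = icmp ult i64 %a, %b
  %r = select i1 %lt, i32 -1, i32 1   ; x86 lowers this to setae + leal -1(%rax,%rax)
  ret i32 %r
}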
; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB18_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB18_1: # %res_block +; X64-NEXT: je .LBB18_3 +; X64-NEXT: .LBB18_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB18_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m @@ -754,19 +750,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB22_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB22_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movq 16(%rdi), %rcx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx ; X64-SSE2-NEXT: je .LBB22_3 -; X64-SSE2-NEXT: .LBB22_1: # %res_block +; X64-SSE2-NEXT: .LBB22_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB22_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -774,18 +770,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: length24_eq: -; X64-AVX: # BB#0: # %loadbb +; X64-AVX: # BB#0: ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX-NEXT: jne .LBB22_1 -; X64-AVX-NEXT: # BB#2: # %loadbb1 +; X64-AVX-NEXT: jne .LBB22_2 +; X64-AVX-NEXT: # BB#1: # %loadbb1 ; X64-AVX-NEXT: movq 16(%rdi), %rcx ; X64-AVX-NEXT: xorl %eax, %eax ; X64-AVX-NEXT: cmpq 16(%rsi), %rcx ; X64-AVX-NEXT: je .LBB22_3 -; X64-AVX-NEXT: .LBB22_1: # %res_block +; X64-AVX-NEXT: .LBB22_2: # %res_block ; X64-AVX-NEXT: movl $1, %eax ; X64-AVX-NEXT: .LBB22_3: # %endblock ; X64-AVX-NEXT: testl %eax, %eax @@ -810,18 +806,18 @@ define i1 @length24_eq_const(i8* %X) nounwind { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB23_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB23_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) ; X64-SSE2-NEXT: je .LBB23_3 -; X64-SSE2-NEXT: .LBB23_1: # %res_block +; X64-SSE2-NEXT: .LBB23_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB23_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -829,18 +825,18 @@ define i1 @length24_eq_const(i8* %X) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: length24_eq_const: -; X64-AVX: # BB#0: # %loadbb +; X64-AVX: # BB#0: ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX-NEXT: jne .LBB23_1 -; X64-AVX-NEXT: # BB#2: # %loadbb1 +; X64-AVX-NEXT: jne .LBB23_2 +; X64-AVX-NEXT: # BB#1: # %loadbb1 ; X64-AVX-NEXT: xorl %eax, %eax ; X64-AVX-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; 
X64-AVX-NEXT: cmpq %rcx, 16(%rdi) ; X64-AVX-NEXT: je .LBB23_3 -; X64-AVX-NEXT: .LBB23_1: # %res_block +; X64-AVX-NEXT: .LBB23_2: # %res_block ; X64-AVX-NEXT: movl $1, %eax ; X64-AVX-NEXT: .LBB23_3: # %endblock ; X64-AVX-NEXT: testl %eax, %eax @@ -898,7 +894,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE1-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 @@ -906,8 +902,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X86-SSE2-NEXT: pmovmskb %xmm1, %edx ; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB25_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB25_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -915,7 +911,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB25_3 -; X86-SSE2-NEXT: .LBB25_1: # %res_block +; X86-SSE2-NEXT: .LBB25_2: # %res_block ; X86-SSE2-NEXT: movl $1, %eax ; X86-SSE2-NEXT: .LBB25_3: # %endblock ; X86-SSE2-NEXT: testl %eax, %eax @@ -923,14 +919,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB25_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB25_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -938,7 +934,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB25_3 -; X64-SSE2-NEXT: .LBB25_1: # %res_block +; X64-SSE2-NEXT: .LBB25_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB25_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -946,20 +942,20 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: length32_eq: -; X64-AVX1: # BB#0: # %loadbb +; X64-AVX1: # BB#0: ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX1-NEXT: jne .LBB25_1 -; X64-AVX1-NEXT: # BB#2: # %loadbb1 +; X64-AVX1-NEXT: jne .LBB25_2 +; X64-AVX1-NEXT: # BB#1: # %loadbb1 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx ; X64-AVX1-NEXT: xorl %eax, %eax ; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-AVX1-NEXT: je .LBB25_3 -; X64-AVX1-NEXT: .LBB25_1: # %res_block +; X64-AVX1-NEXT: .LBB25_2: # %res_block ; X64-AVX1-NEXT: movl $1, %eax ; X64-AVX1-NEXT: .LBB25_3: # %endblock ; X64-AVX1-NEXT: testl %eax, %eax @@ -1006,21 +1002,21 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-SSE1-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movdqu (%eax), %xmm0 
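; The pcmpeqb/pmovmskb/cmpl $65535 sequence these SSE2 checks encode is the
; 16-bytes-at-a-time equality test: pcmpeqb sets each matching byte to 0xFF,
; pmovmskb packs the 16 byte sign bits into a GPR, and the mask equals 0xFFFF
; exactly when every byte pair matched. A minimal standalone IR sketch of the
; same test, assuming the SSE2 pmovmskb intrinsic (function name hypothetical):
define i1 @eq16_sketch(<16 x i8> %a, <16 x i8> %b) {
  %cmp = icmp eq <16 x i8> %a, %b
  %bytes = sext <16 x i1> %cmp to <16 x i8>    ; 0xFF per matching byte (pcmpeqb)
  %mask = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %bytes)
  %all = icmp eq i32 %mask, 65535              ; 0xFFFF <=> all 16 bytes equal
  ret i1 %all
}
declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>)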
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB26_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB26_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB26_3 -; X86-SSE2-NEXT: .LBB26_1: # %res_block +; X86-SSE2-NEXT: .LBB26_2: # %res_block ; X86-SSE2-NEXT: movl $1, %eax ; X86-SSE2-NEXT: .LBB26_3: # %endblock ; X86-SSE2-NEXT: testl %eax, %eax @@ -1028,20 +1024,20 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB26_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB26_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB26_3 -; X64-SSE2-NEXT: .LBB26_1: # %res_block +; X64-SSE2-NEXT: .LBB26_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB26_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -1049,20 +1045,20 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: length32_eq_const: -; X64-AVX1: # BB#0: # %loadbb +; X64-AVX1: # BB#0: ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX1-NEXT: jne .LBB26_1 -; X64-AVX1-NEXT: # BB#2: # %loadbb1 +; X64-AVX1-NEXT: jne .LBB26_2 +; X64-AVX1-NEXT: # BB#1: # %loadbb1 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx ; X64-AVX1-NEXT: xorl %eax, %eax ; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-AVX1-NEXT: je .LBB26_3 -; X64-AVX1-NEXT: .LBB26_1: # %res_block +; X64-AVX1-NEXT: .LBB26_2: # %res_block ; X64-AVX1-NEXT: movl $1, %eax ; X64-AVX1-NEXT: .LBB26_3: # %endblock ; X64-AVX1-NEXT: testl %eax, %eax @@ -1136,20 +1132,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind { ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB28_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB28_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB28_3 -; X64-AVX2-NEXT: .LBB28_1: # %res_block +; X64-AVX2-NEXT: .LBB28_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB28_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -1197,20 +1193,20 @@ define i1 @length64_eq_const(i8* %X) nounwind { ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq_const: -; 
X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB29_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB29_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB29_3 -; X64-AVX2-NEXT: .LBB29_1: # %res_block +; X64-AVX2-NEXT: .LBB29_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB29_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/CodeGenPrepare/X86/memcmp.ll deleted file mode 100644 index a4f635c956d..00000000000 --- a/test/Transforms/CodeGenPrepare/X86/memcmp.ll +++ /dev/null @@ -1,771 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -codegenprepare -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32 -; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64 - -declare i32 @memcmp(i8* nocapture, i8* nocapture, i64) - -define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp2( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]]) -; ALL-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 -; ALL-NEXT: [[TMP8:%.*]] = zext i16 [[TMP6]] to i32 -; ALL-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]] -; ALL-NEXT: ret i32 [[TMP9]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) - ret i32 %call -} - -define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp3( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) -; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = icmp eq i16 [[TMP4]], [[TMP5]] -; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; ALL: res_block: -; ALL-NEXT: [[TMP7:%.*]] = icmp ult i16 [[TMP4]], [[TMP5]] -; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 2 -; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 2 -; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] -; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] -; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 -; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 -; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] -; ALL-NEXT: br label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] 
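; The deleted cmp3 checks above show the general shape of the expansion this
; pass emits: loadbb compares the first two bytes as a byte-swapped
; (big-endian) i16 and branches to res_block on inequality, where an unsigned
; compare selects -1 or 1; loadbb1 handles the trailing byte, whose
; zero-extended difference already carries the right sign, so no select is
; needed there. A runnable sketch of the same structure (function name
; hypothetical):
define i32 @cmp3_sketch(i8* %x, i8* %y) {
loadbb:
  %px = bitcast i8* %x to i16*
  %py = bitcast i8* %y to i16*
  %a0 = load i16, i16* %px
  %b0 = load i16, i16* %py
  %a = call i16 @llvm.bswap.i16(i16 %a0)
  %b = call i16 @llvm.bswap.i16(i16 %b0)
  %eq = icmp eq i16 %a, %b
  br i1 %eq, label %loadbb1, label %res_block

res_block:                                  ; first i16 differed
  %lt = icmp ult i16 %a, %b
  %sel = select i1 %lt, i32 -1, i32 1
  br label %endblock

loadbb1:                                    ; compare the third byte
  %px2 = getelementptr i8, i8* %x, i64 2
  %py2 = getelementptr i8, i8* %y, i64 2
  %a1 = load i8, i8* %px2
  %b1 = load i8, i8* %py2
  %za = zext i8 %a1 to i32
  %zb = zext i8 %b1 to i32
  %sub = sub i32 %za, %zb                   ; sign already correct
  br label %endblock

endblock:
  %res = phi i32 [ %sub, %loadbb1 ], [ %sel, %res_block ]
  ret i32 %res
}
declare i16 @llvm.bswap.i16(i16)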
-; ALL-NEXT: ret i32 [[PHI_RES]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) - ret i32 %call -} - -define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp4( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]]) -; ALL-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]] -; ALL-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; ALL-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; ALL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] -; ALL-NEXT: ret i32 [[TMP11]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) - ret i32 %call -} - -define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp5( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] -; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; ALL: res_block: -; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]] -; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 4 -; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] -; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] -; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 -; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 -; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] -; ALL-NEXT: br label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] -; ALL-NEXT: ret i32 [[PHI_RES]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) - ret i32 %call -} - -define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp6( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] -; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; ALL: res_block: -; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] -; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] -; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] -; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* -; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* -; ALL-NEXT: [[TMP11:%.*]] = 
getelementptr i16, i16* [[TMP9]], i16 2 -; ALL-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 -; ALL-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] -; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] -; ALL-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) -; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) -; ALL-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32 -; ALL-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 -; ALL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]] -; ALL-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] -; ALL-NEXT: ret i32 [[PHI_RES]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) - ret i32 %call -} - -define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp7( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) -; ALL-NEXT: ret i32 [[CALL]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) - ret i32 %call -} - -define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp8( -; X32-NEXT: loadbb: -; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] -; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; X32: res_block: -; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] -; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] -; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] -; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; X32-NEXT: br label [[ENDBLOCK:%.*]] -; X32: loadbb1: -; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* -; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* -; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 -; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 -; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] -; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] -; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) -; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) -; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] -; X32-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] -; X32: endblock: -; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] -; X32-NEXT: ret i32 [[PHI_RES]] -; -; X64-LABEL: @cmp8( -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]]) -; X64-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]] -; X64-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]] -; X64-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; X64-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; X64-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] -; X64-NEXT: ret i32 [[TMP11]] 
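; cmp8's X64 form just above needs no control flow at all: both 8-byte words
; are loaded, byte-swapped so that an unsigned comparison agrees with
; memcmp's byte order, and the result is computed as (a > b) - (a < b).
; Standalone sketch of that straight-line expansion (function name
; hypothetical):
define i32 @cmp8_sketch(i8* %x, i8* %y) {
  %px = bitcast i8* %x to i64*
  %py = bitcast i8* %y to i64*
  %a0 = load i64, i64* %px
  %b0 = load i64, i64* %py
  %a = call i64 @llvm.bswap.i64(i64 %a0)    ; big-endian word order so that
  %b = call i64 @llvm.bswap.i64(i64 %b0)    ; unsigned compare matches memcmp
  %gt = icmp ugt i64 %a, %b
  %lt = icmp ult i64 %a, %b
  %zg = zext i1 %gt to i32
  %zl = zext i1 %lt to i32
  %res = sub i32 %zg, %zl                   ; 1, 0, or -1
  ret i32 %res
}
declare i64 @llvm.bswap.i64(i64)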
-; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) - ret i32 %call -} - -define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp9( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; X32-NEXT: ret i32 [[CALL]] -; -; X64-LABEL: @cmp9( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; X64: res_block: -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]] -; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] -; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] -; X64-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 -; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 -; X64-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] -; X64-NEXT: br label [[ENDBLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] -; X64-NEXT: ret i32 [[PHI_RES]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) - ret i32 %call -} - -define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp10( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; X32-NEXT: ret i32 [[CALL]] -; -; X64-LABEL: @cmp10( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* -; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4 -; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 -; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] -; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) -; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) -; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64 -; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 -; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] -; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label 
[[RES_BLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] -; X64-NEXT: ret i32 [[PHI_RES]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) - ret i32 %call -} - -define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; ALL-NEXT: ret i32 [[CALL]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) - ret i32 %call -} - -define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp12( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; X32-NEXT: ret i32 [[CALL]] -; -; X64-LABEL: @cmp12( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* -; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 -; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 -; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] -; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) -; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) -; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 -; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 -; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] -; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] -; X64-NEXT: ret i32 [[PHI_RES]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) - ret i32 %call -} - -define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: ret i32 [[CALL]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) - ret i32 %call -} - -define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: ret i32 [[CALL]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) - ret i32 %call -} - -define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: ret i32 [[CALL]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) - ret i32 %call -} - -define i32 @cmp16(i8* nocapture readonly 
%x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp16( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; X32-NEXT: ret i32 [[CALL]] -; -; X64-LABEL: @cmp16( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i64* -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i64* -; X64-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 1 -; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1 -; X64-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP11]] -; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] -; X64-NEXT: [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]]) -; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) -; X64-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP15]], [[TMP16]] -; X64-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] -; X64-NEXT: ret i32 [[PHI_RES]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) - ret i32 %call -} - -define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq2( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] -; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq3( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = icmp ne i16 [[TMP2]], [[TMP3]] -; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; ALL: res_block: -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 2 -; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 2 -; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] -; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] -; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] 
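; For the cmp_eq* cases the expansion is simpler still: since only equality
; matters, no bswap is needed and res_block degenerates to the constant 1;
; endblock phis in 0 or 1 and the caller's icmp/zext fold on top of that.
; Sketch of the three-byte case mirroring cmp_eq3 above (function name
; hypothetical):
define i32 @cmp_eq3_sketch(i8* %x, i8* %y) {
loadbb:
  %px = bitcast i8* %x to i16*
  %py = bitcast i8* %y to i16*
  %a0 = load i16, i16* %px
  %b0 = load i16, i16* %py
  %ne0 = icmp ne i16 %a0, %b0               ; no bswap: byte order is irrelevant
  br i1 %ne0, label %res_block, label %loadbb1

loadbb1:
  %px2 = getelementptr i8, i8* %x, i64 2
  %py2 = getelementptr i8, i8* %y, i64 2
  %a1 = load i8, i8* %px2
  %b1 = load i8, i8* %py2
  %ne1 = icmp ne i8 %a1, %b1
  br i1 %ne1, label %res_block, label %endblock

res_block:                                  ; some byte differed
  br label %endblock

endblock:
  %phi = phi i32 [ 0, %loadbb1 ], [ 1, %res_block ]
  %cmp = icmp eq i32 %phi, 0
  %conv = zext i1 %cmp to i32               ; memcmp(x, y, 3) == 0
  ret i32 %conv
}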
-; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq4( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] -; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq5( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] -; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; ALL: res_block: -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 4 -; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] -; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] -; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq6( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] -; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; ALL: res_block: -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* -; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* -; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2 -; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 -; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] -; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] -; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] -; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; ALL-NEXT: 
[[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq7( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq8( -; X32-NEXT: loadbb: -; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] -; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X32: res_block: -; X32-NEXT: br label [[ENDBLOCK:%.*]] -; X32: loadbb1: -; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* -; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* -; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 -; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 -; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] -; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] -; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] -; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X32: endblock: -; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64-LABEL: @cmp_eq8( -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] -; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq9( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64-LABEL: @cmp_eq9( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] -; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64: res_block: -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] -; X64-NEXT: 
[[TMP8:%.*]] = load i8, i8* [[TMP6]] -; X64-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] -; X64-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq10( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64-LABEL: @cmp_eq10( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] -; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64: res_block: -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* -; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* -; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4 -; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 -; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] -; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] -; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] -; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq12( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64-LABEL: @cmp_eq12( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] -; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64: res_block: -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* -; X64-NEXT: [[TMP6:%.*]] = bitcast i8* 
[[Y]] to i32* -; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 -; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 -; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] -; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] -; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] -; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq16( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64-LABEL: @cmp_eq16( -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i128* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i128* -; X64-NEXT: [[TMP3:%.*]] = load i128, i128* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = load i128, i128* [[TMP2]] -; X64-NEXT: [[TMP5:%.*]] = icmp ne i128 [[TMP3]], [[TMP4]] -; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - diff --git a/test/Transforms/ExpandMemCmp/X86/lit.local.cfg b/test/Transforms/ExpandMemCmp/X86/lit.local.cfg new file mode 100644 index 00000000000..e71f3cc4c41 --- /dev/null +++ b/test/Transforms/ExpandMemCmp/X86/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'X86' in config.root.targets: + config.unsupported = True + diff --git a/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/test/Transforms/ExpandMemCmp/X86/memcmp.ll new file mode 100644 index 
00000000000..1abfb20f369 --- /dev/null +++ b/test/Transforms/ExpandMemCmp/X86/memcmp.ll @@ -0,0 +1,792 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -expandmemcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32 +; RUN: opt -S -expandmemcmp -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64 + +declare i32 @memcmp(i8* nocapture, i8* nocapture, i64) + +define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp2( +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; ALL-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]]) +; ALL-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 +; ALL-NEXT: [[TMP8:%.*]] = zext i16 [[TMP6]] to i32 +; ALL-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]] +; ALL-NEXT: ret i32 [[TMP9]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) + ret i32 %call +} + +define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp3( +; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL: res_block: +; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i16 [ [[TMP7:%.*]], [[LOADBB]] ] +; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i16 [ [[TMP8:%.*]], [[LOADBB]] ] +; ALL-NEXT: [[TMP1:%.*]] = icmp ult i16 [[PHI_SRC1]], [[PHI_SRC2]] +; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]] +; ALL-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP4]] +; ALL-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]]) +; ALL-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]]) +; ALL-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] +; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; ALL-NEXT: br label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; ALL-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) + ret i32 %call +} + +define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp4( +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; ALL-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]]) +; ALL-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]] +; ALL-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 
+; ALL-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 +; ALL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; ALL-NEXT: ret i32 [[TMP11]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) + ret i32 %call +} + +define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp5( +; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL: res_block: +; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ] +; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ] +; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] +; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] +; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] +; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; ALL-NEXT: br label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; ALL-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) + ret i32 %call +} + +define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp6( +; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL: res_block: +; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] +; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] +; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] +; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 +; ALL-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2 +; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; ALL-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] +; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; ALL-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; ALL-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 +; ALL-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i32 +; ALL-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]] +; 
ALL-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; ALL-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) + ret i32 %call +} + +define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp7( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) +; ALL-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) + ret i32 %call +} + +define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp8( +; X32-NEXT: br label [[LOADBB:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb: +; X32-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] +; X32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] +; X32-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; X32-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; X32-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1 +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]] +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) +; X32-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp8( +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]]) +; X64-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]] +; X64-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]] +; X64-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 +; X64-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 +; X64-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; X64-NEXT: ret i32 [[TMP11]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) + ret i32 %call +} + +define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp9( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) +; X32-NEXT: ret i32 [[CALL]] +; +; X64-LABEL: @cmp9( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ 
[[TMP8:%.*]], [[LOADBB]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] +; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X64-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) + ret i32 %call +} + +define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp10( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) +; X32-NEXT: ret i32 [[CALL]] +; +; X64-LABEL: @cmp10( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 4 +; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 +; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64 +; X64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]] +; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) + ret i32 %call +} + +define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture 
readonly %y) { +; ALL-LABEL: @cmp11( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) +; ALL-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) + ret i32 %call +} + +define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp12( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) +; X32-NEXT: ret i32 [[CALL]] +; +; X64-LABEL: @cmp12( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 2 +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19]] = zext i32 [[TMP17]] to i64 +; X64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]] +; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) + ret i32 %call +} + +define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp13( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) +; ALL-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) + ret i32 %call +} + +define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp14( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) +; ALL-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) + ret i32 %call +} + +define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp15( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) +; ALL-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) + ret i32 %call +} + +define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp16( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) +; X32-NEXT: ret i32 [[CALL]] +; +; X64-LABEL: @cmp16( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; 
X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i64* +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i64* +; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i64, i64* [[TMP11]], i64 1 +; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]] +; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) +; X64-NEXT: [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]]) +; X64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]] +; X64-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) + ret i32 %call +} + +define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq2( +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] +; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq3( +; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] +; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] +; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 
[[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq4( +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq5( +; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] +; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq6( +; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; ALL-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2 +; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; ALL-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] +; ALL-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] +; ALL-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) + 
%cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq7( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq8( +; X32-NEXT: br label [[LOADBB:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb: +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; X32-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; X32-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 1 +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] +; X32-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] +; X32-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq8( +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq9( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq9( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; X64-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = icmp 
ne i8 [[TMP8]], [[TMP9]] +; X64-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq10( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq10( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 +; X64-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4 +; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] +; X64-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] +; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq11( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq12( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq12( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] 
to i32* +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2 +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] +; X64-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] +; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq13( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq14( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq15( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq16( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq16( +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i128* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i128* +; X64-NEXT: [[TMP3:%.*]] = load i128, i128* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i128, i128* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i128 [[TMP3]], [[TMP4]] +; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + -- cgit v1.2.3 From c0222867301e7d88ec925dea7d306468ff3ea172 Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Thu, 2 Nov 2017 15:53:10 +0000 Subject: Revert "[ExpandMemCmp] Split ExpandMemCmp from CodeGen into its own pass." undefined reference to `llvm::TargetPassConfig::ID' on clang-ppc64le-linux-multistage This reverts commit eea333c33fa73ad225ef28607795984829f65688. 
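
This is most likely a layering problem rather than a bug in the pass itself:
the standalone ExpandMemCmp pass lives in lib/Transforms/Scalar but reaches
into CodeGen for its target hooks, along the lines of the usual
pass-dependency idiom (a sketch; the exact line in the reverted commit may
differ):

    INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)  // references TargetPassConfig::ID

With that dependency, any tool that links LLVMScalarOpts without LLVMCodeGen
fails at link time with exactly this kind of undefined reference.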
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317213 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/InitializePasses.h | 1 - include/llvm/LinkAllPasses.h | 1 - include/llvm/Transforms/Scalar.h | 8 +- lib/CodeGen/CodeGenPrepare.cpp | 710 +++++++++++++++++++++ lib/CodeGen/TargetPassConfig.cpp | 10 +- lib/Transforms/Scalar/CMakeLists.txt | 1 - lib/Transforms/Scalar/ExpandMemCmp.cpp | 828 ------------------------- lib/Transforms/Scalar/Scalar.cpp | 1 - test/CodeGen/Generic/llc-start-stop.ll | 6 +- test/CodeGen/X86/memcmp-optsize.ll | 224 +++---- test/CodeGen/X86/memcmp.ll | 240 +++---- test/Transforms/CodeGenPrepare/X86/memcmp.ll | 771 +++++++++++++++++++++++ test/Transforms/ExpandMemCmp/X86/lit.local.cfg | 3 - test/Transforms/ExpandMemCmp/X86/memcmp.ll | 792 ----------------------- 14 files changed, 1722 insertions(+), 1874 deletions(-) delete mode 100644 lib/Transforms/Scalar/ExpandMemCmp.cpp create mode 100644 test/Transforms/CodeGenPrepare/X86/memcmp.ll delete mode 100644 test/Transforms/ExpandMemCmp/X86/lit.local.cfg delete mode 100644 test/Transforms/ExpandMemCmp/X86/memcmp.ll diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 67a077081f7..c3ad8fe41af 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -128,7 +128,6 @@ void initializeEdgeBundlesPass(PassRegistry&); void initializeEfficiencySanitizerPass(PassRegistry&); void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&); void initializeExpandISelPseudosPass(PassRegistry&); -void initializeExpandMemCmpPassPass(PassRegistry&); void initializeExpandPostRAPass(PassRegistry&); void initializeExpandReductionsPass(PassRegistry&); void initializeExternalAAWrapperPassPass(PassRegistry&); diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index ce70f53ccb0..765e63926da 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -180,7 +180,6 @@ namespace { (void) llvm::createReversePostOrderFunctionAttrsPass(); (void) llvm::createMergeFunctionsPass(); (void) llvm::createMergeICmpsPass(); - (void) llvm::createExpandMemCmpPass(); std::string buf; llvm::raw_string_ostream os(buf); (void) llvm::createPrintModulePass(os); diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index 4b365858787..8ef65774a93 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -422,16 +422,10 @@ Pass *createLowerGuardIntrinsicPass(); //===----------------------------------------------------------------------===// // -// MergeICmps - Merge integer comparison chains into a memcmp +// MergeICmps - Merge integer comparison chains // Pass *createMergeICmpsPass(); -//===----------------------------------------------------------------------===// -// -// ExpandMemCmp - Expand memcmp() to load/stores. 
-//
-Pass *createExpandMemCmpPass();
-
 //===----------------------------------------------------------------------===//
 //
 // ValuePropagation - Propagate CFG-derived value information
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 973ddebd987..51f2a320b29 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -123,6 +123,12 @@ STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
+STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
+STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
+STATISTIC(NumMemCmpGreaterThanMax,
+          "Number of memcmp calls with size greater than max size");
+STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
+
 static cl::opt<bool> DisableBranchOpts(
     "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
     cl::desc("Disable branch optimizations in CodeGenPrepare"));
@@ -183,6 +189,11 @@ EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
     cl::desc("Enable merging of redundant sexts when one is dominating"
              " the other."), cl::init(true));
+static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
+    "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
+    cl::desc("The number of loads per basic block for inline expansion of "
+             "memcmp that is only being compared against zero."));
+
 namespace {
 
 using SetOfInstrs = SmallPtrSet<Instruction *, 16>;
@@ -1686,6 +1697,699 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
   return true;
 }
 
+namespace {
+
+// This class provides helper functions to expand a memcmp library call into an
+// inline expansion.
+class MemCmpExpansion {
+  struct ResultBlock {
+    BasicBlock *BB = nullptr;
+    PHINode *PhiSrc1 = nullptr;
+    PHINode *PhiSrc2 = nullptr;
+
+    ResultBlock() = default;
+  };
+
+  CallInst *const CI;
+  ResultBlock ResBlock;
+  const uint64_t Size;
+  unsigned MaxLoadSize;
+  uint64_t NumLoadsNonOneByte;
+  const uint64_t NumLoadsPerBlock;
+  std::vector<BasicBlock *> LoadCmpBlocks;
+  BasicBlock *EndBlock;
+  PHINode *PhiRes;
+  const bool IsUsedForZeroCmp;
+  const DataLayout &DL;
+  IRBuilder<> Builder;
+  // Represents the decomposition in blocks of the expansion. For example,
+  // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
+  // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}].
+  // TODO(courbet): Involve the target more in this computation. On X86, 7
+  // bytes can be done more efficiently with two overlapping 4-byte loads than
+  // covering the interval with [{4, 0},{2, 4},{1, 6}].
+  struct LoadEntry {
+    LoadEntry(unsigned LoadSize, uint64_t Offset)
+        : LoadSize(LoadSize), Offset(Offset) {
+      assert(Offset % LoadSize == 0 && "invalid load entry");
+    }
+
+    uint64_t getGEPIndex() const { return Offset / LoadSize; }
+
+    // The size of the load for this block, in bytes.
+    const unsigned LoadSize;
+    // The offset of this load WRT the base pointer, in bytes.
+    const uint64_t Offset;
+  };
+  SmallVector<LoadEntry, 8> LoadSequence;
+
+  void createLoadCmpBlocks();
+  void createResultBlock();
+  void setupResultBlockPHINodes();
+  void setupEndBlockPHINodes();
+  Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex);
+  void emitLoadCompareBlock(unsigned BlockIndex);
+  void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
+                                         unsigned &LoadIndex);
+  void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex);
+  void emitMemCmpResultBlock();
+  Value *getMemCmpExpansionZeroCase();
+  Value *getMemCmpEqZeroOneBlock();
+  Value *getMemCmpOneBlock();
+
+ public:
+  MemCmpExpansion(CallInst *CI, uint64_t Size,
+                  const TargetTransformInfo::MemCmpExpansionOptions &Options,
+                  unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
+                  unsigned NumLoadsPerBlock, const DataLayout &DL);
+
+  unsigned getNumBlocks();
+  uint64_t getNumLoads() const { return LoadSequence.size(); }
+
+  Value *getMemCmpExpansion();
+};
+
+} // end anonymous namespace
+
+// Initialize the basic block structure required for expansion of memcmp call
+// with given maximum load size and memcmp size parameter.
+// This structure includes:
+// 1. A list of load compare blocks - LoadCmpBlocks.
+// 2. An EndBlock, split from original instruction point, which is the block to
+// return from.
+// 3. ResultBlock, block to branch to for early exit when a
+// LoadCmpBlock finds a difference.
+MemCmpExpansion::MemCmpExpansion(
+    CallInst *const CI, uint64_t Size,
+    const TargetTransformInfo::MemCmpExpansionOptions &Options,
+    const unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
+    const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout)
+    : CI(CI),
+      Size(Size),
+      MaxLoadSize(0),
+      NumLoadsNonOneByte(0),
+      NumLoadsPerBlock(NumLoadsPerBlock),
+      IsUsedForZeroCmp(IsUsedForZeroCmp),
+      DL(TheDataLayout),
+      Builder(CI) {
+  assert(Size > 0 && "zero blocks");
+  // Scale the max size down if the target can load more bytes than we need.
+  size_t LoadSizeIndex = 0;
+  while (LoadSizeIndex < Options.LoadSizes.size() &&
+         Options.LoadSizes[LoadSizeIndex] > Size) {
+    ++LoadSizeIndex;
+  }
+  this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex];
+  // Compute the decomposition.
+  uint64_t CurSize = Size;
+  uint64_t Offset = 0;
+  while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) {
+    const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex];
+    assert(LoadSize > 0 && "zero load size");
+    const uint64_t NumLoadsForThisSize = CurSize / LoadSize;
+    if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
+      // Do not expand if the total number of loads is larger than what the
+      // target allows. Note that it's important that we exit before completing
+      // the expansion to avoid using a ton of memory to store the expansion for
+      // large sizes.
+      LoadSequence.clear();
+      return;
+    }
+    if (NumLoadsForThisSize > 0) {
+      for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
+        LoadSequence.push_back({LoadSize, Offset});
+        Offset += LoadSize;
+      }
+      if (LoadSize > 1) {
+        ++NumLoadsNonOneByte;
+      }
+      CurSize = CurSize % LoadSize;
+    }
+    ++LoadSizeIndex;
+  }
+  assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
+}
+
+unsigned MemCmpExpansion::getNumBlocks() {
+  if (IsUsedForZeroCmp)
+    return getNumLoads() / NumLoadsPerBlock +
+           (getNumLoads() % NumLoadsPerBlock != 0 ? 1 : 0);
+  return getNumLoads();
+}
+
+void MemCmpExpansion::createLoadCmpBlocks() {
+  for (unsigned i = 0; i < getNumBlocks(); i++) {
+    BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
+                                        EndBlock->getParent(), EndBlock);
+    LoadCmpBlocks.push_back(BB);
+  }
+}
+
+void MemCmpExpansion::createResultBlock() {
+  ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
+                                   EndBlock->getParent(), EndBlock);
+}
+
+// This function creates the IR instructions for loading and comparing 1 byte.
+// It loads 1 byte from each source of the memcmp parameters with the given
+// GEPIndex. It then subtracts the two loaded values and adds this result to the
+// final phi node for selecting the memcmp result.
+void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
+                                               unsigned GEPIndex) {
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+  Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
+  // Cast source to LoadSizeType*.
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Get the base address using the GEPIndex.
+  if (GEPIndex != 0) {
+    Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+    Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+  }
+
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
+  LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
+  Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+
+  PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]);
+
+  if (BlockIndex < (LoadCmpBlocks.size() - 1)) {
+    // Early exit branch if difference found to EndBlock. Otherwise, continue
+    // to the next LoadCmpBlock.
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+                                    ConstantInt::get(Diff->getType(), 0));
+    BranchInst *CmpBr =
+        BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp);
+    Builder.Insert(CmpBr);
+  } else {
+    // The last block has an unconditional branch to EndBlock.
+    BranchInst *CmpBr = BranchInst::Create(EndBlock);
+    Builder.Insert(CmpBr);
+  }
+}
+
+/// Generate an equality comparison for one or more pairs of loaded values.
+/// This is used in the case where the memcmp() call is compared equal or not
+/// equal to zero.
+Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
+                                            unsigned &LoadIndex) {
+  assert(LoadIndex < getNumLoads() &&
+         "getCompareLoadPairs() called with no remaining loads");
+  std::vector<Value *> XorList, OrList;
+  Value *Diff;
+
+  const unsigned NumLoads =
+      std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock);
+
+  // For a single-block expansion, start inserting before the memcmp call.
+  if (LoadCmpBlocks.empty())
+    Builder.SetInsertPoint(CI);
+  else
+    Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+
+  Value *Cmp = nullptr;
+  // If we have multiple loads per block, we need to generate a composite
+  // comparison using xor+or. The type for the combinations is the largest load
+  // type.
+  IntegerType *const MaxLoadType =
+      NumLoads == 1 ? nullptr
+                    : IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
+    const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
+
+    IntegerType *LoadSizeType =
+        IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
+
+    Value *Source1 = CI->getArgOperand(0);
+    Value *Source2 = CI->getArgOperand(1);
+
+    // Cast source to LoadSizeType*.
+    if (Source1->getType() != LoadSizeType)
+      Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+    if (Source2->getType() != LoadSizeType)
+      Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+    // Get the base address using a GEP.
+    if (CurLoadEntry.Offset != 0) {
+      Source1 = Builder.CreateGEP(
+          LoadSizeType, Source1,
+          ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+      Source2 = Builder.CreateGEP(
+          LoadSizeType, Source2,
+          ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+    }
+
+    // Get a constant or load a value for each source address.
+    Value *LoadSrc1 = nullptr;
+    if (auto *Source1C = dyn_cast<Constant>(Source1))
+      LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
+    if (!LoadSrc1)
+      LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+
+    Value *LoadSrc2 = nullptr;
+    if (auto *Source2C = dyn_cast<Constant>(Source2))
+      LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
+    if (!LoadSrc2)
+      LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+    if (NumLoads != 1) {
+      if (LoadSizeType != MaxLoadType) {
+        LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
+        LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
+      }
+      // If we have multiple loads per block, we need to generate a composite
+      // comparison using xor+or.
+      Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
+      Diff = Builder.CreateZExt(Diff, MaxLoadType);
+      XorList.push_back(Diff);
+    } else {
+      // If there's only one load per block, we just compare the loaded values.
+      Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
+    }
+  }
+
+  auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
+    std::vector<Value *> OutList;
+    for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
+      Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
+      OutList.push_back(Or);
+    }
+    if (InList.size() % 2 != 0)
+      OutList.push_back(InList.back());
+    return OutList;
+  };
+
+  if (!Cmp) {
+    // Pairwise OR the XOR results.
+    OrList = pairWiseOr(XorList);
+
+    // Pairwise OR the OR results until one result left.
+    while (OrList.size() != 1) {
+      OrList = pairWiseOr(OrList);
+    }
+    Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
+  }
+
+  return Cmp;
+}
+
+void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
+                                                        unsigned &LoadIndex) {
+  Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);
+
+  BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
+                           ? EndBlock
+                           : LoadCmpBlocks[BlockIndex + 1];
+  // Early exit branch if difference found to ResultBlock. Otherwise,
+  // continue to next LoadCmpBlock or EndBlock.
+  BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+  Builder.Insert(CmpBr);
+
+  // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
+  // since early exit to ResultBlock was not taken (no difference was found in
+  // any of the bytes).
+  if (BlockIndex == LoadCmpBlocks.size() - 1) {
+    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+    PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
+  }
+}
+
+// This function creates the IR instructions for loading and comparing using
+// the given LoadSize. It loads the number of bytes specified by LoadSize from
+// each source of the memcmp parameters. It then does a subtract to see if
+// there was a difference in the loaded values. If a difference is found, it
+// branches with an early exit to the ResultBlock for calculating which source
+// was larger. Otherwise, it falls through to either the next LoadCmpBlock or
+// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled
+// with a special case through emitLoadCompareByteBlock. The special handling
+// can simply subtract the loaded values and add it to the result phi node.
+void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
+  // There is one load per block in this case, BlockIndex == LoadIndex.
+  const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];
+
+  if (CurLoadEntry.LoadSize == 1) {
+    MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex,
+                                              CurLoadEntry.getGEPIndex());
+    return;
+  }
+
+  Type *LoadSizeType =
+      IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
+  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
+
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+  // Cast source to LoadSizeType*.
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Get the base address using a GEP.
+  if (CurLoadEntry.Offset != 0) {
+    Source1 = Builder.CreateGEP(
+        LoadSizeType, Source1,
+        ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+    Source2 = Builder.CreateGEP(
+        LoadSizeType, Source2,
+        ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+  }
+
+  // Load LoadSizeType from the base address.
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  if (DL.isLittleEndian()) {
+    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+                                                Intrinsic::bswap, LoadSizeType);
+    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+  }
+
+  if (LoadSizeType != MaxLoadType) {
+    LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
+    LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
+  }
+
+  // Add the loaded values to the phi nodes for calculating memcmp result only
+  // if result is not used in a zero equality.
+  if (!IsUsedForZeroCmp) {
+    ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]);
+    ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]);
+  }
+
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2);
+  BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
+                           ? EndBlock
+                           : LoadCmpBlocks[BlockIndex + 1];
+  // Early exit branch if difference found to ResultBlock. Otherwise, continue
+  // to next LoadCmpBlock or EndBlock.
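+  // (Note the branch polarity relative to the multiple-loads variant above:
+  // here Cmp is an equality, so the 'true' edge falls through to NextBB and
+  // the 'false' edge exits to ResBlock.BB, whereas getCompareLoadPairs()
+  // produces an inequality and its block branches to res_block on 'true'.)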
+ BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp); + Builder.Insert(CmpBr); + + // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 + // since early exit to ResultBlock was not taken (no difference was found in + // any of the bytes). + if (BlockIndex == LoadCmpBlocks.size() - 1) { + Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); + PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); + } +} + +// This function populates the ResultBlock with a sequence to calculate the +// memcmp result. It compares the two loaded source values and returns -1 if +// src1 < src2 and 1 if src1 > src2. +void MemCmpExpansion::emitMemCmpResultBlock() { + // Special case: if memcmp result is used in a zero equality, result does not + // need to be calculated and can simply return 1. + if (IsUsedForZeroCmp) { + BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.BB, InsertPt); + Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); + PhiRes->addIncoming(Res, ResBlock.BB); + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + return; + } + BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.BB, InsertPt); + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1, + ResBlock.PhiSrc2); + + Value *Res = + Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), + ConstantInt::get(Builder.getInt32Ty(), 1)); + + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + PhiRes->addIncoming(Res, ResBlock.BB); +} + +void MemCmpExpansion::setupResultBlockPHINodes() { + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + Builder.SetInsertPoint(ResBlock.BB); + // Note: this assumes one load per block. + ResBlock.PhiSrc1 = + Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1"); + ResBlock.PhiSrc2 = + Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2"); +} + +void MemCmpExpansion::setupEndBlockPHINodes() { + Builder.SetInsertPoint(&EndBlock->front()); + PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); +} + +Value *MemCmpExpansion::getMemCmpExpansionZeroCase() { + unsigned LoadIndex = 0; + // This loop populates each of the LoadCmpBlocks with the IR sequence to + // handle multiple loads per block. + for (unsigned I = 0; I < getNumBlocks(); ++I) { + emitLoadCompareBlockMultipleLoads(I, LoadIndex); + } + + emitMemCmpResultBlock(); + return PhiRes; +} + +/// A memcmp expansion that compares equality with 0 and only has one block of +/// load and compare can bypass the compare, branch, and phi IR that is required +/// in the general case. +Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() { + unsigned LoadIndex = 0; + Value *Cmp = getCompareLoadPairs(0, LoadIndex); + assert(LoadIndex == getNumLoads() && "some entries were not consumed"); + return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); +} + +/// A memcmp expansion that only has one block of load and compare can bypass +/// the compare, branch, and phi IR that is required in the general case. +Value *MemCmpExpansion::getMemCmpOneBlock() { + assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block"); + + Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8); + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + // Cast source to LoadSizeType*. 
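+  // (In this one-block case the whole comparison is done as a single integer
+  // of width Size * 8, e.g. memcmp(p, q, 8) on a 64-bit target loads both
+  // sources as one i64 each.)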
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Load LoadSizeType from the base address.
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  if (DL.isLittleEndian() && Size != 1) {
+    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+                                                Intrinsic::bswap, LoadSizeType);
+    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+  }
+
+  if (Size < 4) {
+    // The i8 and i16 cases don't need compares. We zext the loaded values and
+    // subtract them to get the suitable negative, zero, or positive i32 result.
+    LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty());
+    LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty());
+    return Builder.CreateSub(LoadSrc1, LoadSrc2);
+  }
+
+  // The result of memcmp is negative, zero, or positive, so produce that by
+  // subtracting 2 extended compare bits: sub (ugt, ult).
+  // If a target prefers to use selects to get -1/0/1, they should be able
+  // to transform this later. The inverse transform (going from selects to math)
+  // may not be possible in the DAG because the selects got converted into
+  // branches before we got there.
+  Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
+  Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
+  Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
+  Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
+  return Builder.CreateSub(ZextUGT, ZextULT);
+}
+
+// This function expands the memcmp call into an inline expansion and returns
+// the memcmp result.
+Value *MemCmpExpansion::getMemCmpExpansion() {
+  // A memcmp with zero-comparison with only one block of load and compare does
+  // not need to set up any extra blocks. This case could be handled in the DAG,
+  // but since we have all of the machinery to flexibly expand any memcmp here,
+  // we choose to handle this case too to avoid fragmented lowering.
+  if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) {
+    BasicBlock *StartBlock = CI->getParent();
+    EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+    setupEndBlockPHINodes();
+    createResultBlock();
+
+    // If return value of memcmp is not used in a zero equality, we need to
+    // calculate which source was larger. The calculation requires the
+    // two loaded source values of each load compare block.
+    // These will be saved in the phi nodes created by setupResultBlockPHINodes.
+    if (!IsUsedForZeroCmp) setupResultBlockPHINodes();
+
+    // Create the number of required load compare basic blocks.
+    createLoadCmpBlocks();
+
+    // Update the terminator added by splitBasicBlock to branch to the first
+    // LoadCmpBlock.
+    StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
+  }
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  if (IsUsedForZeroCmp)
+    return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock()
+                               : getMemCmpExpansionZeroCase();
+
+  // TODO: Handle more than one load pair per block in getMemCmpOneBlock().
+  if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock();
+
+  for (unsigned I = 0; I < getNumBlocks(); ++I) {
+    emitLoadCompareBlock(I);
+  }
+
+  emitMemCmpResultBlock();
+  return PhiRes;
+}
+
+// This function checks to see if an expansion of memcmp can be generated.
+// It checks for constant compare size that is less than the max inline size. +// If an expansion cannot occur, returns false to leave as a library call. +// Otherwise, the library call is replaced with a new IR instruction sequence. +/// We want to transform: +/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15) +/// To: +/// loadbb: +/// %0 = bitcast i32* %buffer2 to i8* +/// %1 = bitcast i32* %buffer1 to i8* +/// %2 = bitcast i8* %1 to i64* +/// %3 = bitcast i8* %0 to i64* +/// %4 = load i64, i64* %2 +/// %5 = load i64, i64* %3 +/// %6 = call i64 @llvm.bswap.i64(i64 %4) +/// %7 = call i64 @llvm.bswap.i64(i64 %5) +/// %8 = sub i64 %6, %7 +/// %9 = icmp ne i64 %8, 0 +/// br i1 %9, label %res_block, label %loadbb1 +/// res_block: ; preds = %loadbb2, +/// %loadbb1, %loadbb +/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ] +/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ] +/// %10 = icmp ult i64 %phi.src1, %phi.src2 +/// %11 = select i1 %10, i32 -1, i32 1 +/// br label %endblock +/// loadbb1: ; preds = %loadbb +/// %12 = bitcast i32* %buffer2 to i8* +/// %13 = bitcast i32* %buffer1 to i8* +/// %14 = bitcast i8* %13 to i32* +/// %15 = bitcast i8* %12 to i32* +/// %16 = getelementptr i32, i32* %14, i32 2 +/// %17 = getelementptr i32, i32* %15, i32 2 +/// %18 = load i32, i32* %16 +/// %19 = load i32, i32* %17 +/// %20 = call i32 @llvm.bswap.i32(i32 %18) +/// %21 = call i32 @llvm.bswap.i32(i32 %19) +/// %22 = zext i32 %20 to i64 +/// %23 = zext i32 %21 to i64 +/// %24 = sub i64 %22, %23 +/// %25 = icmp ne i64 %24, 0 +/// br i1 %25, label %res_block, label %loadbb2 +/// loadbb2: ; preds = %loadbb1 +/// %26 = bitcast i32* %buffer2 to i8* +/// %27 = bitcast i32* %buffer1 to i8* +/// %28 = bitcast i8* %27 to i16* +/// %29 = bitcast i8* %26 to i16* +/// %30 = getelementptr i16, i16* %28, i16 6 +/// %31 = getelementptr i16, i16* %29, i16 6 +/// %32 = load i16, i16* %30 +/// %33 = load i16, i16* %31 +/// %34 = call i16 @llvm.bswap.i16(i16 %32) +/// %35 = call i16 @llvm.bswap.i16(i16 %33) +/// %36 = zext i16 %34 to i64 +/// %37 = zext i16 %35 to i64 +/// %38 = sub i64 %36, %37 +/// %39 = icmp ne i64 %38, 0 +/// br i1 %39, label %res_block, label %loadbb3 +/// loadbb3: ; preds = %loadbb2 +/// %40 = bitcast i32* %buffer2 to i8* +/// %41 = bitcast i32* %buffer1 to i8* +/// %42 = getelementptr i8, i8* %41, i8 14 +/// %43 = getelementptr i8, i8* %40, i8 14 +/// %44 = load i8, i8* %42 +/// %45 = load i8, i8* %43 +/// %46 = zext i8 %44 to i32 +/// %47 = zext i8 %45 to i32 +/// %48 = sub i32 %46, %47 +/// br label %endblock +/// endblock: ; preds = %res_block, +/// %loadbb3 +/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ] +/// ret i32 %phi.res +static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, + const TargetLowering *TLI, const DataLayout *DL) { + NumMemCmpCalls++; + + // Early exit from expansion if -Oz. + if (CI->getFunction()->optForMinSize()) + return false; + + // Early exit from expansion if size is not a constant. + ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + if (!SizeCast) { + NumMemCmpNotConstant++; + return false; + } + const uint64_t SizeVal = SizeCast->getZExtValue(); + + if (SizeVal == 0) { + return false; + } + + // TTI call to check if target would like to expand memcmp. Also, get the + // available load sizes.
+ const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); + const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp); + if (!Options) return false; + + const unsigned MaxNumLoads = + TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize()); + + MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads, + IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL); + + // Don't expand if this will require more loads than desired by the target. + if (Expansion.getNumLoads() == 0) { + NumMemCmpGreaterThanMax++; + return false; + } + + NumMemCmpInlined++; + + Value *Res = Expansion.getMemCmpExpansion(); + + // Replace call with result of expansion and erase call. + CI->replaceAllUsesWith(Res); + CI->eraseFromParent(); + + return true; +} + bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { BasicBlock *BB = CI->getParent(); @@ -1838,6 +2542,12 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { return true; } + LibFunc Func; + if (TLInfo->getLibFunc(ImmutableCallSite(CI), Func) && + Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TLI, DL)) { + ModifiedDT = true; + return true; + } return false; } diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp index 59e88ba3bda..c5101b1ecfc 100644 --- a/lib/CodeGen/TargetPassConfig.cpp +++ b/lib/CodeGen/TargetPassConfig.cpp @@ -600,14 +600,8 @@ void TargetPassConfig::addIRPasses() { addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n")); } - if (getOptLevel() != CodeGenOpt::None) { - // The MergeICmpsPass tries to create memcmp calls by grouping sequences of - // loads and compares. ExpandMemCmpPass then tries to expand those calls - // into optimally-sized loads and compares. The transforms are enabled by a - // target lowering hook. - if (EnableMergeICmps) - addPass(createMergeICmpsPass()); - addPass(createExpandMemCmpPass()); + if (getOptLevel() != CodeGenOpt::None && EnableMergeICmps) { + addPass(createMergeICmpsPass()); } // Run GC lowering passes for builtin collectors diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 164163d2131..d79ae851005 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -9,7 +9,6 @@ add_llvm_library(LLVMScalarOpts DeadStoreElimination.cpp DivRemPairs.cpp EarlyCSE.cpp - ExpandMemCmp.cpp FlattenCFGPass.cpp Float2Int.cpp GuardWidening.cpp diff --git a/lib/Transforms/Scalar/ExpandMemCmp.cpp b/lib/Transforms/Scalar/ExpandMemCmp.cpp deleted file mode 100644 index 0cd8c11422f..00000000000 --- a/lib/Transforms/Scalar/ExpandMemCmp.cpp +++ /dev/null @@ -1,828 +0,0 @@ -//===--- ExpandMemCmp.cpp - Expand memcmp() to load/stores ----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass tries to partially inline the fast path of well-known library -// functions, such as using square-root instructions for cases where sqrt() -// does not need to set errno. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetSubtargetInfo.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" - -using namespace llvm; - -#define DEBUG_TYPE "expandmemcmp" - -STATISTIC(NumMemCmpCalls, "Number of memcmp calls"); -STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size"); -STATISTIC(NumMemCmpGreaterThanMax, - "Number of memcmp calls with size greater than max size"); -STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls"); - -static cl::opt<unsigned> MemCmpNumLoadsPerBlock( - "memcmp-num-loads-per-block", cl::Hidden, cl::init(1), - cl::desc("The number of loads per basic block for inline expansion of " - "memcmp that is only being compared against zero.")); - -namespace { - - -// This class provides helper functions to expand a memcmp library call into an -// inline expansion. -class MemCmpExpansion { - struct ResultBlock { - BasicBlock *BB = nullptr; - PHINode *PhiSrc1 = nullptr; - PHINode *PhiSrc2 = nullptr; - - ResultBlock() = default; - }; - - CallInst *const CI; - ResultBlock ResBlock; - const uint64_t Size; - unsigned MaxLoadSize; - uint64_t NumLoadsNonOneByte; - const uint64_t NumLoadsPerBlock; - std::vector<BasicBlock *> LoadCmpBlocks; - BasicBlock *EndBlock; - PHINode *PhiRes; - const bool IsUsedForZeroCmp; - const DataLayout &DL; - IRBuilder<> Builder; - // Represents the decomposition in blocks of the expansion. For example, - // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and - // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}]. - // TODO(courbet): Involve the target more in this computation. On X86, 7 - // bytes can be done more efficiently with two overlapping 4-byte loads than - // covering the interval with [{4, 0},{2, 4},{1, 6}]. - struct LoadEntry { - LoadEntry(unsigned LoadSize, uint64_t Offset) - : LoadSize(LoadSize), Offset(Offset) { - assert(Offset % LoadSize == 0 && "invalid load entry"); - } - - uint64_t getGEPIndex() const { return Offset / LoadSize; } - - // The size of the load for this block, in bytes. - const unsigned LoadSize; - // The offset of this load WRT the base pointer, in bytes.
- const uint64_t Offset; - }; - SmallVector<LoadEntry, 8> LoadSequence; - - void createLoadCmpBlocks(); - void createResultBlock(); - void setupResultBlockPHINodes(); - void setupEndBlockPHINodes(); - Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex); - void emitLoadCompareBlock(unsigned BlockIndex); - void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, - unsigned &LoadIndex); - void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex); - void emitMemCmpResultBlock(); - Value *getMemCmpExpansionZeroCase(); - Value *getMemCmpEqZeroOneBlock(); - Value *getMemCmpOneBlock(); - - public: - MemCmpExpansion(CallInst *CI, uint64_t Size, - const TargetTransformInfo::MemCmpExpansionOptions &Options, - unsigned MaxNumLoads, const bool IsUsedForZeroCmp, - unsigned NumLoadsPerBlock, const DataLayout &DL); - - unsigned getNumBlocks(); - uint64_t getNumLoads() const { return LoadSequence.size(); } - - Value *getMemCmpExpansion(); -}; - -// Initialize the basic block structure required for expansion of memcmp call -// with given maximum load size and memcmp size parameter. -// This structure includes: -// 1. A list of load compare blocks - LoadCmpBlocks. -// 2. An EndBlock, split from original instruction point, which is the block to -// return from. -// 3. ResultBlock, block to branch to for early exit when a -// LoadCmpBlock finds a difference. -MemCmpExpansion::MemCmpExpansion( - CallInst *const CI, uint64_t Size, - const TargetTransformInfo::MemCmpExpansionOptions &Options, - const unsigned MaxNumLoads, const bool IsUsedForZeroCmp, - const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout) - : CI(CI), - Size(Size), - MaxLoadSize(0), - NumLoadsNonOneByte(0), - NumLoadsPerBlock(NumLoadsPerBlock), - IsUsedForZeroCmp(IsUsedForZeroCmp), - DL(TheDataLayout), - Builder(CI) { - assert(Size > 0 && "zero blocks"); - // Scale the max size down if the target can load more bytes than we need. - size_t LoadSizeIndex = 0; - while (LoadSizeIndex < Options.LoadSizes.size() && - Options.LoadSizes[LoadSizeIndex] > Size) { - ++LoadSizeIndex; - } - this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex]; - // Compute the decomposition. - uint64_t CurSize = Size; - uint64_t Offset = 0; - while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) { - const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex]; - assert(LoadSize > 0 && "zero load size"); - const uint64_t NumLoadsForThisSize = CurSize / LoadSize; - if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) { - // Do not expand if the total number of loads is larger than what the - // target allows. Note that it's important that we exit before completing - // the expansion to avoid using a ton of memory to store the expansion for - // large sizes. - LoadSequence.clear(); - return; - } - if (NumLoadsForThisSize > 0) { - for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) { - LoadSequence.push_back({LoadSize, Offset}); - Offset += LoadSize; - } - if (LoadSize > 1) { - ++NumLoadsNonOneByte; - } - CurSize = CurSize % LoadSize; - } - ++LoadSizeIndex; - } - assert(LoadSequence.size() <= MaxNumLoads && "broken invariant"); -} - -unsigned MemCmpExpansion::getNumBlocks() { - if (IsUsedForZeroCmp) - return getNumLoads() / NumLoadsPerBlock + - (getNumLoads() % NumLoadsPerBlock != 0 ?
1 : 0); - return getNumLoads(); -} - -void MemCmpExpansion::createLoadCmpBlocks() { - for (unsigned i = 0; i < getNumBlocks(); i++) { - BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb", - EndBlock->getParent(), EndBlock); - LoadCmpBlocks.push_back(BB); - } -} - -void MemCmpExpansion::createResultBlock() { - ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block", - EndBlock->getParent(), EndBlock); -} - -// This function creates the IR instructions for loading and comparing 1 byte. -// It loads 1 byte from each source of the memcmp parameters with the given -// GEPIndex. It then subtracts the two loaded values and adds this result to the -// final phi node for selecting the memcmp result. -void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex, - unsigned GEPIndex) { - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - Type *LoadSizeType = Type::getInt8Ty(CI->getContext()); - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using the GEPIndex. - if (GEPIndex != 0) { - Source1 = Builder.CreateGEP(LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, GEPIndex)); - Source2 = Builder.CreateGEP(LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, GEPIndex)); - } - - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext())); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext())); - Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); - - PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]); - - if (BlockIndex < (LoadCmpBlocks.size() - 1)) { - // Early exit branch if difference found to EndBlock. Otherwise, continue to - // next LoadCmpBlock. - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, - ConstantInt::get(Diff->getType(), 0)); - BranchInst *CmpBr = - BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp); - Builder.Insert(CmpBr); - } else { - // The last block has an unconditional branch to EndBlock. - BranchInst *CmpBr = BranchInst::Create(EndBlock); - Builder.Insert(CmpBr); - } -} - -/// Generate an equality comparison for one or more pairs of loaded values. -/// This is used in the case where the memcmp() call is compared equal or not -/// equal to zero. -Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex, - unsigned &LoadIndex) { - assert(LoadIndex < getNumLoads() && - "getCompareLoadPairs() called with no remaining loads"); - std::vector<Value *> XorList, OrList; - Value *Diff; - - const unsigned NumLoads = - std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock); - - // For a single-block expansion, start inserting before the memcmp call. - if (LoadCmpBlocks.empty()) - Builder.SetInsertPoint(CI); - else - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - - Value *Cmp = nullptr; - // If we have multiple loads per block, we need to generate a composite - // comparison using xor+or. The type for the combinations is the largest load - // type. - IntegerType *const MaxLoadType = - NumLoads == 1 ?
nullptr - : IntegerType::get(CI->getContext(), MaxLoadSize * 8); - for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) { - const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex]; - - IntegerType *LoadSizeType = - IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); - - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using a GEP. - if (CurLoadEntry.Offset != 0) { - Source1 = Builder.CreateGEP( - LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - Source2 = Builder.CreateGEP( - LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - } - - // Get a constant or load a value for each source address. - Value *LoadSrc1 = nullptr; - if (auto *Source1C = dyn_cast<Constant>(Source1)) - LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL); - if (!LoadSrc1) - LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - - Value *LoadSrc2 = nullptr; - if (auto *Source2C = dyn_cast<Constant>(Source2)) - LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL); - if (!LoadSrc2) - LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (NumLoads != 1) { - if (LoadSizeType != MaxLoadType) { - LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); - } - // If we have multiple loads per block, we need to generate a composite - // comparison using xor+or. - Diff = Builder.CreateXor(LoadSrc1, LoadSrc2); - Diff = Builder.CreateZExt(Diff, MaxLoadType); - XorList.push_back(Diff); - } else { - // If there's only one load per block, we just compare the loaded values. - Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2); - } - } - - auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> { - std::vector<Value *> OutList; - for (unsigned i = 0; i < InList.size() - 1; i = i + 2) { - Value *Or = Builder.CreateOr(InList[i], InList[i + 1]); - OutList.push_back(Or); - } - if (InList.size() % 2 != 0) - OutList.push_back(InList.back()); - return OutList; - }; - - if (!Cmp) { - // Pairwise OR the XOR results. - OrList = pairWiseOr(XorList); - - // Pairwise OR the OR results until one result left. - while (OrList.size() != 1) { - OrList = pairWiseOr(OrList); - } - Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0)); - } - - return Cmp; -} - -void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, - unsigned &LoadIndex) { - Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex); - - BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1)) - ? EndBlock - : LoadCmpBlocks[BlockIndex + 1]; - // Early exit branch if difference found to ResultBlock. Otherwise, - // continue to next LoadCmpBlock or EndBlock. - BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); - Builder.Insert(CmpBr); - - // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 - // since early exit to ResultBlock was not taken (no difference was found in - // any of the bytes).
- if (BlockIndex == LoadCmpBlocks.size() - 1) { - Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); - PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); - } -} - -// This function creates the IR instructions for loading and comparing using the -// given LoadSize. It loads the number of bytes specified by LoadSize from each -// source of the memcmp parameters. It then does a subtract to see if there was -// a difference in the loaded values. If a difference is found, it branches -// with an early exit to the ResultBlock for calculating which source was -// larger. Otherwise, it falls through to either the next LoadCmpBlock or -// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with -// a special case through emitLoadCompareByteBlock. The special handling can -// simply subtract the loaded values and add it to the result phi node. -void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) { - // There is one load per block in this case, BlockIndex == LoadIndex. - const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex]; - - if (CurLoadEntry.LoadSize == 1) { - MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, - CurLoadEntry.getGEPIndex()); - return; - } - - Type *LoadSizeType = - IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); - Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); - assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type"); - - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using a GEP. - if (CurLoadEntry.Offset != 0) { - Source1 = Builder.CreateGEP( - LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - Source2 = Builder.CreateGEP( - LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - } - - // Load LoadSizeType from the base address. - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (DL.isLittleEndian()) { - Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::bswap, LoadSizeType); - LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); - LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); - } - - if (LoadSizeType != MaxLoadType) { - LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); - } - - // Add the loaded values to the phi nodes for calculating memcmp result only - // if result is not used in a zero equality. - if (!IsUsedForZeroCmp) { - ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]); - ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]); - } - - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2); - BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1)) - ? EndBlock - : LoadCmpBlocks[BlockIndex + 1]; - // Early exit branch if difference found to ResultBlock. Otherwise, continue - // to next LoadCmpBlock or EndBlock.
- BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp); - Builder.Insert(CmpBr); - - // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 - // since early exit to ResultBlock was not taken (no difference was found in - // any of the bytes). - if (BlockIndex == LoadCmpBlocks.size() - 1) { - Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); - PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); - } -} - -// This function populates the ResultBlock with a sequence to calculate the -// memcmp result. It compares the two loaded source values and returns -1 if -// src1 < src2 and 1 if src1 > src2. -void MemCmpExpansion::emitMemCmpResultBlock() { - // Special case: if memcmp result is used in a zero equality, result does not - // need to be calculated and can simply return 1. - if (IsUsedForZeroCmp) { - BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); - Builder.SetInsertPoint(ResBlock.BB, InsertPt); - Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); - PhiRes->addIncoming(Res, ResBlock.BB); - BranchInst *NewBr = BranchInst::Create(EndBlock); - Builder.Insert(NewBr); - return; - } - BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); - Builder.SetInsertPoint(ResBlock.BB, InsertPt); - - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1, - ResBlock.PhiSrc2); - - Value *Res = - Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), - ConstantInt::get(Builder.getInt32Ty(), 1)); - - BranchInst *NewBr = BranchInst::Create(EndBlock); - Builder.Insert(NewBr); - PhiRes->addIncoming(Res, ResBlock.BB); -} - -void MemCmpExpansion::setupResultBlockPHINodes() { - Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); - Builder.SetInsertPoint(ResBlock.BB); - // Note: this assumes one load per block. - ResBlock.PhiSrc1 = - Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1"); - ResBlock.PhiSrc2 = - Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2"); -} - -void MemCmpExpansion::setupEndBlockPHINodes() { - Builder.SetInsertPoint(&EndBlock->front()); - PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); -} - -Value *MemCmpExpansion::getMemCmpExpansionZeroCase() { - unsigned LoadIndex = 0; - // This loop populates each of the LoadCmpBlocks with the IR sequence to - // handle multiple loads per block. - for (unsigned I = 0; I < getNumBlocks(); ++I) { - emitLoadCompareBlockMultipleLoads(I, LoadIndex); - } - - emitMemCmpResultBlock(); - return PhiRes; -} - -/// A memcmp expansion that compares equality with 0 and only has one block of -/// load and compare can bypass the compare, branch, and phi IR that is required -/// in the general case. -Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() { - unsigned LoadIndex = 0; - Value *Cmp = getCompareLoadPairs(0, LoadIndex); - assert(LoadIndex == getNumLoads() && "some entries were not consumed"); - return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); -} - -/// A memcmp expansion that only has one block of load and compare can bypass -/// the compare, branch, and phi IR that is required in the general case. -Value *MemCmpExpansion::getMemCmpOneBlock() { - assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block"); - - Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8); - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - // Cast source to LoadSizeType*. 
- if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Load LoadSizeType from the base address. - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (DL.isLittleEndian() && Size != 1) { - Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::bswap, LoadSizeType); - LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); - LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); - } - - if (Size < 4) { - // The i8 and i16 cases don't need compares. We zext the loaded values and - // subtract them to get the suitable negative, zero, or positive i32 result. - LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty()); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty()); - return Builder.CreateSub(LoadSrc1, LoadSrc2); - } - - // The result of memcmp is negative, zero, or positive, so produce that by - // subtracting 2 extended compare bits: sub (ugt, ult). - // If a target prefers to use selects to get -1/0/1, they should be able - // to transform this later. The inverse transform (going from selects to math) - // may not be possible in the DAG because the selects got converted into - // branches before we got there. - Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2); - Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2); - Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty()); - Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty()); - return Builder.CreateSub(ZextUGT, ZextULT); -} - -// This function expands the memcmp call into an inline expansion and returns -// the memcmp result. -Value *MemCmpExpansion::getMemCmpExpansion() { - // A memcmp with zero-comparison with only one block of load and compare does - // not need to set up any extra blocks. This case could be handled in the DAG, - // but since we have all of the machinery to flexibly expand any memcmp here, - // we choose to handle this case too to avoid fragmented lowering. - if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) { - BasicBlock *StartBlock = CI->getParent(); - EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); - setupEndBlockPHINodes(); - createResultBlock(); - - // If return value of memcmp is not used in a zero equality, we need to - // calculate which source was larger. The calculation requires the - // two loaded source values of each load compare block. - // These will be saved in the phi nodes created by setupResultBlockPHINodes. - if (!IsUsedForZeroCmp) setupResultBlockPHINodes(); - - // Create the number of required load compare basic blocks. - createLoadCmpBlocks(); - - // Update the terminator added by splitBasicBlock to branch to the first - // LoadCmpBlock. - StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); - } - - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - if (IsUsedForZeroCmp) - return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock() - : getMemCmpExpansionZeroCase(); - - // TODO: Handle more than one load pair per block in getMemCmpOneBlock(). - if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock(); - - for (unsigned I = 0; I < getNumBlocks(); ++I) { - emitLoadCompareBlock(I); - } - - emitMemCmpResultBlock(); - return PhiRes; -} - -// This function checks to see if an expansion of memcmp can be generated.
-// It checks for constant compare size that is less than the max inline size. -// If an expansion cannot occur, returns false to leave as a library call. -// Otherwise, the library call is replaced with a new IR instruction sequence. -/// We want to transform: -/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15) -/// To: -/// loadbb: -/// %0 = bitcast i32* %buffer2 to i8* -/// %1 = bitcast i32* %buffer1 to i8* -/// %2 = bitcast i8* %1 to i64* -/// %3 = bitcast i8* %0 to i64* -/// %4 = load i64, i64* %2 -/// %5 = load i64, i64* %3 -/// %6 = call i64 @llvm.bswap.i64(i64 %4) -/// %7 = call i64 @llvm.bswap.i64(i64 %5) -/// %8 = sub i64 %6, %7 -/// %9 = icmp ne i64 %8, 0 -/// br i1 %9, label %res_block, label %loadbb1 -/// res_block: ; preds = %loadbb2, -/// %loadbb1, %loadbb -/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ] -/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ] -/// %10 = icmp ult i64 %phi.src1, %phi.src2 -/// %11 = select i1 %10, i32 -1, i32 1 -/// br label %endblock -/// loadbb1: ; preds = %loadbb -/// %12 = bitcast i32* %buffer2 to i8* -/// %13 = bitcast i32* %buffer1 to i8* -/// %14 = bitcast i8* %13 to i32* -/// %15 = bitcast i8* %12 to i32* -/// %16 = getelementptr i32, i32* %14, i32 2 -/// %17 = getelementptr i32, i32* %15, i32 2 -/// %18 = load i32, i32* %16 -/// %19 = load i32, i32* %17 -/// %20 = call i32 @llvm.bswap.i32(i32 %18) -/// %21 = call i32 @llvm.bswap.i32(i32 %19) -/// %22 = zext i32 %20 to i64 -/// %23 = zext i32 %21 to i64 -/// %24 = sub i64 %22, %23 -/// %25 = icmp ne i64 %24, 0 -/// br i1 %25, label %res_block, label %loadbb2 -/// loadbb2: ; preds = %loadbb1 -/// %26 = bitcast i32* %buffer2 to i8* -/// %27 = bitcast i32* %buffer1 to i8* -/// %28 = bitcast i8* %27 to i16* -/// %29 = bitcast i8* %26 to i16* -/// %30 = getelementptr i16, i16* %28, i16 6 -/// %31 = getelementptr i16, i16* %29, i16 6 -/// %32 = load i16, i16* %30 -/// %33 = load i16, i16* %31 -/// %34 = call i16 @llvm.bswap.i16(i16 %32) -/// %35 = call i16 @llvm.bswap.i16(i16 %33) -/// %36 = zext i16 %34 to i64 -/// %37 = zext i16 %35 to i64 -/// %38 = sub i64 %36, %37 -/// %39 = icmp ne i64 %38, 0 -/// br i1 %39, label %res_block, label %loadbb3 -/// loadbb3: ; preds = %loadbb2 -/// %40 = bitcast i32* %buffer2 to i8* -/// %41 = bitcast i32* %buffer1 to i8* -/// %42 = getelementptr i8, i8* %41, i8 14 -/// %43 = getelementptr i8, i8* %40, i8 14 -/// %44 = load i8, i8* %42 -/// %45 = load i8, i8* %43 -/// %46 = zext i8 %44 to i32 -/// %47 = zext i8 %45 to i32 -/// %48 = sub i32 %46, %47 -/// br label %endblock -/// endblock: ; preds = %res_block, -/// %loadbb3 -/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ] -/// ret i32 %phi.res -static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, - const TargetLowering *TLI, const DataLayout *DL) { - NumMemCmpCalls++; - - // Early exit from expansion if -Oz. - if (CI->getFunction()->optForMinSize()) - return false; - - // Early exit from expansion if size is not a constant. - ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2)); - if (!SizeCast) { - NumMemCmpNotConstant++; - return false; - } - const uint64_t SizeVal = SizeCast->getZExtValue(); - - if (SizeVal == 0) { - return false; - } - - // TTI call to check if target would like to expand memcmp. Also, get the - // available load sizes.
- const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); - const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp); - if (!Options) return false; - - const unsigned MaxNumLoads = - TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize()); - - MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads, - IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL); - - // Don't expand if this will require more loads than desired by the target. - if (Expansion.getNumLoads() == 0) { - NumMemCmpGreaterThanMax++; - return false; - } - - NumMemCmpInlined++; - - Value *Res = Expansion.getMemCmpExpansion(); - - // Replace call with result of expansion and erase call. - CI->replaceAllUsesWith(Res); - CI->eraseFromParent(); - - return true; -} - - - -class ExpandMemCmpPass : public FunctionPass { -public: - static char ID; - - ExpandMemCmpPass() : FunctionPass(ID) { - initializeExpandMemCmpPassPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override { - if (skipFunction(F)) return false; - - auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); - if (!TPC) { - return false; - } - const TargetLowering* TL = - TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering(); - - const TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - const TargetTransformInfo *TTI = - &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - auto PA = runImpl(F, TLI, TTI, TL); - return !PA.areAllPreserved(); - } - -private: - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - FunctionPass::getAnalysisUsage(AU); - } - - PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, - const TargetLowering* TL); - // Returns true if a change was made. - bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, const TargetLowering* TL, - const DataLayout& DL); -}; - -bool ExpandMemCmpPass::runOnBlock( - BasicBlock &BB, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, const TargetLowering* TL, - const DataLayout& DL) { - for (Instruction& I : BB) { - CallInst *CI = dyn_cast<CallInst>(&I); - if (!CI) { - continue; - } - LibFunc Func; - if (TLI->getLibFunc(ImmutableCallSite(CI), Func) && - Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TL, &DL)) { - return true; - } - } - return false; -} - - -PreservedAnalyses ExpandMemCmpPass::runImpl( - Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, - const TargetLowering* TL) { - const DataLayout& DL = F.getParent()->getDataLayout(); - bool MadeChanges = false; - for (auto BBIt = F.begin(); BBIt != F.end();) { - if (runOnBlock(*BBIt, TLI, TTI, TL, DL)) { - MadeChanges = true; - // If changes were made, restart the function from the beginning, since - // the structure of the function was changed. - BBIt = F.begin(); - } else { - ++BBIt; - } - } - return MadeChanges ?
PreservedAnalyses::none() : PreservedAnalyses::all(); -} - -} // namespace - -char ExpandMemCmpPass::ID = 0; -INITIALIZE_PASS_BEGIN(ExpandMemCmpPass, "expandmemcmp", - "Expand memcmp() to load/stores", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp", - "Expand memcmp() to load/stores", false, false) - -Pass *llvm::createExpandMemCmpPass() { - return new ExpandMemCmpPass(); -} diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 4b694cecea6..c1034ace206 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -48,7 +48,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeNewGVNLegacyPassPass(Registry); initializeEarlyCSELegacyPassPass(Registry); initializeEarlyCSEMemSSALegacyPassPass(Registry); - initializeExpandMemCmpPassPass(Registry); initializeGVNHoistLegacyPassPass(Registry); initializeGVNSinkLegacyPassPass(Registry); initializeFlattenCFGPassPass(Registry); diff --git a/test/CodeGen/Generic/llc-start-stop.ll b/test/CodeGen/Generic/llc-start-stop.ll index 9056e2cab49..85b69c37aa0 100644 --- a/test/CodeGen/Generic/llc-start-stop.ll +++ b/test/CodeGen/Generic/llc-start-stop.ll @@ -13,15 +13,15 @@ ; STOP-BEFORE-NOT: Loop Strength Reduction ; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-AFTER -; START-AFTER: -machine-branch-prob -expandmemcmp +; START-AFTER: -machine-branch-prob -gc-lowering ; START-AFTER: FunctionPass Manager -; START-AFTER-NEXT: Expand memcmp() to load/stores +; START-AFTER-NEXT: Lower Garbage Collection Instructions ; RUN: llc < %s -debug-pass=Structure -start-before=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-BEFORE ; START-BEFORE: -machine-branch-prob -domtree ; START-BEFORE: FunctionPass Manager ; START-BEFORE: Loop Strength Reduction -; START-BEFORE-NEXT: Expand memcmp() to load/stores +; START-BEFORE-NEXT: Lower Garbage Collection Instructions ; RUN: not llc < %s -start-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-START-BEFORE ; RUN: not llc < %s -stop-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-STOP-BEFORE diff --git a/test/CodeGen/X86/memcmp-optsize.ll b/test/CodeGen/X86/memcmp-optsize.ll index 3f5eeba7055..77d9fa69182 100644 --- a/test/CodeGen/X86/memcmp-optsize.ll +++ b/test/CodeGen/X86/memcmp-optsize.ll @@ -156,36 +156,36 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize { define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length3_eq: -; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86: # BB#0: # %loadbb ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %edx -; X86-NEXT: cmpw (%eax), %dx -; X86-NEXT: jne .LBB5_2 -; X86-NEXT: # BB#1: # %loadbb1 -; X86-NEXT: movb 2(%ecx), %dl -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpb 2(%eax), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: cmpw (%ecx), %dx +; X86-NEXT: jne .LBB5_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 2(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 2(%ecx), %dl ; X86-NEXT: je .LBB5_3 -; X86-NEXT: .LBB5_2: # %res_block -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: incl %ecx +; X86-NEXT: .LBB5_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax ; X86-NEXT: .LBB5_3: # %endblock -; X86-NEXT: testl %ecx, %ecx 
+; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: +; X64: # BB#0: # %loadbb ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: cmpw (%rsi), %ax -; X64-NEXT: jne .LBB5_2 -; X64-NEXT: # BB#1: # %loadbb1 +; X64-NEXT: jne .LBB5_1 +; X64-NEXT: # BB#2: # %loadbb1 ; X64-NEXT: movb 2(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 2(%rsi), %cl ; X64-NEXT: je .LBB5_3 -; X64-NEXT: .LBB5_2: # %res_block +; X64-NEXT: .LBB5_1: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB5_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -314,36 +314,36 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize { define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length5_eq: -; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86: # BB#0: # %loadbb ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: cmpl (%eax), %edx -; X86-NEXT: jne .LBB10_2 -; X86-NEXT: # BB#1: # %loadbb1 -; X86-NEXT: movb 4(%ecx), %dl -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpb 4(%eax), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB10_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 4(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 4(%ecx), %dl ; X86-NEXT: je .LBB10_3 -; X86-NEXT: .LBB10_2: # %res_block -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: incl %ecx +; X86-NEXT: .LBB10_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax ; X86-NEXT: .LBB10_3: # %endblock -; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: +; X64: # BB#0: # %loadbb ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: cmpl (%rsi), %eax -; X64-NEXT: jne .LBB10_2 -; X64-NEXT: # BB#1: # %loadbb1 +; X64-NEXT: jne .LBB10_1 +; X64-NEXT: # BB#2: # %loadbb1 ; X64-NEXT: movb 4(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 4(%rsi), %cl ; X64-NEXT: je .LBB10_3 -; X64-NEXT: .LBB10_2: # %res_block +; X64-NEXT: .LBB10_1: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB10_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -356,7 +356,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8: -; X86: # BB#0: +; X86: # BB#0: # %loadbb ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -365,8 +365,8 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB11_2 -; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#2: # %loadbb1 ; X86-NEXT: movl 4(%esi), %ecx ; X86-NEXT: movl 4(%eax), %edx ; X86-NEXT: bswapl %ecx @@ -374,7 +374,7 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: je .LBB11_3 -; X86-NEXT: .LBB11_2: # %res_block +; X86-NEXT: .LBB11_1: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al @@ -400,22 +400,22 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8_eq: -; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86: # BB#0: # %loadbb ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: cmpl (%eax), %edx -; X86-NEXT: jne .LBB12_2 -; X86-NEXT: # BB#1: # %loadbb1 -; X86-NEXT: movl 
4(%ecx), %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpl 4(%eax), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB12_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl 4(%ecx), %edx ; X86-NEXT: je .LBB12_3 -; X86-NEXT: .LBB12_2: # %res_block -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: incl %ecx +; X86-NEXT: .LBB12_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax ; X86-NEXT: .LBB12_3: # %endblock -; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: testl %eax, %eax ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -432,15 +432,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { define i1 @length8_eq_const(i8* %X) nounwind optsize { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: +; X86: # BB#0: # %loadbb ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 -; X86-NEXT: jne .LBB13_2 -; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: jne .LBB13_1 +; X86-NEXT: # BB#2: # %loadbb1 ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 ; X86-NEXT: je .LBB13_3 -; X86-NEXT: .LBB13_2: # %res_block +; X86-NEXT: .LBB13_1: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: incl %eax ; X86-NEXT: .LBB13_3: # %endblock @@ -473,16 +473,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: +; X64: # BB#0: # %loadbb ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB14_2 -; X64-NEXT: # BB#1: # %loadbb1 +; X64-NEXT: jne .LBB14_1 +; X64-NEXT: # BB#2: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl 8(%rsi), %ecx ; X64-NEXT: je .LBB14_3 -; X64-NEXT: .LBB14_2: # %res_block +; X64-NEXT: .LBB14_1: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB14_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -505,27 +505,28 @@ define i32 @length12(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: +; X64: # BB#0: # %loadbb ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB15_2 -; X64-NEXT: # BB#1: # %loadbb1 +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#2: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB15_3 -; X64-NEXT: .LBB15_2: # %res_block +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB15_1: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB15_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -545,27 +546,28 @@ define i32 @length16(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: +; X64: # BB#0: # %loadbb ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB16_2 -; X64-NEXT: # BB#1: # %loadbb1 +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#2: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je 
.LBB16_3 -; X64-NEXT: .LBB16_2: # %res_block +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB16_1: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m @@ -699,19 +701,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: -; X64-SSE2: # BB#0: +; X64-SSE2: # BB#0: # %loadbb ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB20_2 -; X64-SSE2-NEXT: # BB#1: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB20_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 ; X64-SSE2-NEXT: movq 16(%rdi), %rcx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx ; X64-SSE2-NEXT: je .LBB20_3 -; X64-SSE2-NEXT: .LBB20_2: # %res_block +; X64-SSE2-NEXT: .LBB20_1: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB20_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -719,18 +721,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length24_eq: -; X64-AVX2: # BB#0: +; X64-AVX2: # BB#0: # %loadbb ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: jne .LBB20_2 -; X64-AVX2-NEXT: # BB#1: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB20_1 +; X64-AVX2-NEXT: # BB#2: # %loadbb1 ; X64-AVX2-NEXT: movq 16(%rdi), %rcx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx ; X64-AVX2-NEXT: je .LBB20_3 -; X64-AVX2-NEXT: .LBB20_2: # %res_block +; X64-AVX2-NEXT: .LBB20_1: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB20_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -755,18 +757,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: -; X64-SSE2: # BB#0: +; X64-SSE2: # BB#0: # %loadbb ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB21_2 -; X64-SSE2-NEXT: # BB#1: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB21_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) ; X64-SSE2-NEXT: je .LBB21_3 -; X64-SSE2-NEXT: .LBB21_2: # %res_block +; X64-SSE2-NEXT: .LBB21_1: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB21_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -774,18 +776,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length24_eq_const: -; X64-AVX2: # BB#0: +; X64-AVX2: # BB#0: # %loadbb ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: jne .LBB21_2 -; X64-AVX2-NEXT: # BB#1: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB21_1 +; X64-AVX2-NEXT: # BB#2: # %loadbb1 ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-AVX2-NEXT: 
cmpq %rcx, 16(%rdi)
 ; X64-AVX2-NEXT: je .LBB21_3
-; X64-AVX2-NEXT: .LBB21_2: # %res_block
+; X64-AVX2-NEXT: .LBB21_1: # %res_block
 ; X64-AVX2-NEXT: movl $1, %eax
 ; X64-AVX2-NEXT: .LBB21_3: # %endblock
 ; X64-AVX2-NEXT: testl %eax, %eax
@@ -831,7 +833,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
 ; X86-NOSSE-NEXT: retl
 ;
 ; X86-SSE2-LABEL: length32_eq:
-; X86-SSE2: # BB#0:
+; X86-SSE2: # BB#0: # %loadbb
 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
@@ -839,8 +841,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
 ; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
 ; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
-; X86-SSE2-NEXT: jne .LBB23_2
-; X86-SSE2-NEXT: # BB#1: # %loadbb1
+; X86-SSE2-NEXT: jne .LBB23_1
+; X86-SSE2-NEXT: # BB#2: # %loadbb1
 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -848,7 +850,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
 ; X86-SSE2-NEXT: xorl %eax, %eax
 ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
 ; X86-SSE2-NEXT: je .LBB23_3
-; X86-SSE2-NEXT: .LBB23_2: # %res_block
+; X86-SSE2-NEXT: .LBB23_1: # %res_block
 ; X86-SSE2-NEXT: xorl %eax, %eax
 ; X86-SSE2-NEXT: incl %eax
 ; X86-SSE2-NEXT: .LBB23_3: # %endblock
@@ -857,14 +859,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
 ; X86-SSE2-NEXT: retl
 ;
 ; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2: # BB#0:
+; X64-SSE2: # BB#0: # %loadbb
 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB23_2
-; X64-SSE2-NEXT: # BB#1: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB23_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -872,7 +874,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
 ; X64-SSE2-NEXT: xorl %eax, %eax
 ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
 ; X64-SSE2-NEXT: je .LBB23_3
-; X64-SSE2-NEXT: .LBB23_2: # %res_block
+; X64-SSE2-NEXT: .LBB23_1: # %res_block
 ; X64-SSE2-NEXT: movl $1, %eax
 ; X64-SSE2-NEXT: .LBB23_3: # %endblock
 ; X64-SSE2-NEXT: testl %eax, %eax
@@ -907,21 +909,21 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize {
 ; X86-NOSSE-NEXT: retl
 ;
 ; X86-SSE2-LABEL: length32_eq_const:
-; X86-SSE2: # BB#0:
+; X86-SSE2: # BB#0: # %loadbb
 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT: movdqu (%eax), %xmm0
 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
 ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
-; X86-SSE2-NEXT: jne .LBB24_2
-; X86-SSE2-NEXT: # BB#1: # %loadbb1
+; X86-SSE2-NEXT: jne .LBB24_1
+; X86-SSE2-NEXT: # BB#2: # %loadbb1
 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
 ; X86-SSE2-NEXT: xorl %eax, %eax
 ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
 ; X86-SSE2-NEXT: je .LBB24_3
-; X86-SSE2-NEXT: .LBB24_2: # %res_block
+; X86-SSE2-NEXT: .LBB24_1: # %res_block
 ; X86-SSE2-NEXT: xorl %eax, %eax
 ; X86-SSE2-NEXT: incl %eax
 ; X86-SSE2-NEXT: .LBB24_3: # %endblock
@@ -930,20 +932,20 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize {
 ; X86-SSE2-NEXT: retl
 ;
 ; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2: # BB#0:
+; X64-SSE2: # BB#0: # %loadbb
 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB24_2
-; X64-SSE2-NEXT: # BB#1: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB24_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
 ; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
 ; X64-SSE2-NEXT: xorl %eax, %eax
 ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
 ; X64-SSE2-NEXT: je .LBB24_3
-; X64-SSE2-NEXT: .LBB24_2: # %res_block
+; X64-SSE2-NEXT: .LBB24_1: # %res_block
 ; X64-SSE2-NEXT: movl $1, %eax
 ; X64-SSE2-NEXT: .LBB24_3: # %endblock
 ; X64-SSE2-NEXT: testl %eax, %eax
@@ -1007,20 +1009,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize {
 ; X64-SSE2-NEXT: retq
 ;
 ; X64-AVX2-LABEL: length64_eq:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # BB#0: # %loadbb
 ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
 ; X64-AVX2-NEXT: cmpl $-1, %eax
-; X64-AVX2-NEXT: jne .LBB26_2
-; X64-AVX2-NEXT: # BB#1: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB26_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
 ; X64-AVX2-NEXT: xorl %eax, %eax
 ; X64-AVX2-NEXT: cmpl $-1, %ecx
 ; X64-AVX2-NEXT: je .LBB26_3
-; X64-AVX2-NEXT: .LBB26_2: # %res_block
+; X64-AVX2-NEXT: .LBB26_1: # %res_block
 ; X64-AVX2-NEXT: movl $1, %eax
 ; X64-AVX2-NEXT: .LBB26_3: # %endblock
 ; X64-AVX2-NEXT: testl %eax, %eax
@@ -1057,20 +1059,20 @@ define i1 @length64_eq_const(i8* %X) nounwind optsize {
 ; X64-SSE2-NEXT: retq
 ;
 ; X64-AVX2-LABEL: length64_eq_const:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # BB#0: # %loadbb
 ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
 ; X64-AVX2-NEXT: cmpl $-1, %eax
-; X64-AVX2-NEXT: jne .LBB27_2
-; X64-AVX2-NEXT: # BB#1: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB27_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
 ; X64-AVX2-NEXT: xorl %eax, %eax
 ; X64-AVX2-NEXT: cmpl $-1, %ecx
 ; X64-AVX2-NEXT: je .LBB27_3
-; X64-AVX2-NEXT: .LBB27_2: # %res_block
+; X64-AVX2-NEXT: .LBB27_1: # %res_block
 ; X64-AVX2-NEXT: movl $1, %eax
 ; X64-AVX2-NEXT: .LBB27_3: # %endblock
 ; X64-AVX2-NEXT: testl %eax, %eax
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index 84fd45b0a08..393e4c42d8b 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -187,35 +187,35 @@ define i32 @length3(i8* %X, i8* %Y) nounwind {
 
 define i1 @length3_eq(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length3_eq:
-; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86: # BB#0: # %loadbb
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movzwl (%ecx), %edx
-; X86-NEXT: cmpw (%eax), %dx
-; X86-NEXT: jne .LBB7_2
-; X86-NEXT: # BB#1: # %loadbb1
-; X86-NEXT: movb 2(%ecx), %dl
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: cmpb 2(%eax), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl (%eax), %edx
+; X86-NEXT: cmpw (%ecx), %dx
+; X86-NEXT: jne .LBB7_1
+; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: movb 2(%eax), %dl
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpb 2(%ecx), %dl
 ; X86-NEXT: je .LBB7_3
-; X86-NEXT: .LBB7_2: # %res_block
-; X86-NEXT: movl $1, %ecx
+; X86-NEXT: .LBB7_1: # %res_block
+; X86-NEXT: movl $1, %eax
 ; X86-NEXT: .LBB7_3: # %endblock
-; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setne %al
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length3_eq:
-; X64: # BB#0:
+; X64: # BB#0: # %loadbb
 ; X64-NEXT: movzwl (%rdi), %eax
 ; X64-NEXT: cmpw (%rsi), %ax
-; X64-NEXT: jne .LBB7_2
-; X64-NEXT: # BB#1: # %loadbb1
+; X64-NEXT: jne .LBB7_1
+; X64-NEXT: # BB#2: # %loadbb1
 ; X64-NEXT: movb 2(%rdi), %cl
 ; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: cmpb 2(%rsi), %cl
 ; X64-NEXT: je .LBB7_3
-; X64-NEXT: .LBB7_2: # %res_block
+; X64-NEXT: .LBB7_1: # %res_block
 ; X64-NEXT: movl $1, %eax
 ; X64-NEXT: .LBB7_3: # %endblock
 ; X64-NEXT: testl %eax, %eax
@@ -344,35 +344,35 @@ define i32 @length5(i8* %X, i8* %Y) nounwind {
 
 define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length5_eq:
-; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86: # BB#0: # %loadbb
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: cmpl (%eax), %edx
-; X86-NEXT: jne .LBB12_2
-; X86-NEXT: # BB#1: # %loadbb1
-; X86-NEXT: movb 4(%ecx), %dl
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: cmpb 4(%eax), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl (%eax), %edx
+; X86-NEXT: cmpl (%ecx), %edx
+; X86-NEXT: jne .LBB12_1
+; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: movb 4(%eax), %dl
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpb 4(%ecx), %dl
 ; X86-NEXT: je .LBB12_3
-; X86-NEXT: .LBB12_2: # %res_block
-; X86-NEXT: movl $1, %ecx
+; X86-NEXT: .LBB12_1: # %res_block
+; X86-NEXT: movl $1, %eax
 ; X86-NEXT: .LBB12_3: # %endblock
-; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setne %al
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length5_eq:
-; X64: # BB#0:
+; X64: # BB#0: # %loadbb
 ; X64-NEXT: movl (%rdi), %eax
 ; X64-NEXT: cmpl (%rsi), %eax
-; X64-NEXT: jne .LBB12_2
-; X64-NEXT: # BB#1: # %loadbb1
+; X64-NEXT: jne .LBB12_1
+; X64-NEXT: # BB#2: # %loadbb1
 ; X64-NEXT: movb 4(%rdi), %cl
 ; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: cmpb 4(%rsi), %cl
 ; X64-NEXT: je .LBB12_3
-; X64-NEXT: .LBB12_2: # %res_block
+; X64-NEXT: .LBB12_1: # %res_block
 ; X64-NEXT: movl $1, %eax
 ; X64-NEXT: .LBB12_3: # %endblock
 ; X64-NEXT: testl %eax, %eax
@@ -385,7 +385,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
 
 define i32 @length8(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length8:
-; X86: # BB#0:
+; X86: # BB#0: # %loadbb
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -394,21 +394,23 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
 ; X86-NEXT: bswapl %ecx
 ; X86-NEXT: bswapl %edx
 ; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: jne .LBB13_2
-; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: jne .LBB13_1
+; X86-NEXT: # BB#2: # %loadbb1
 ; X86-NEXT: movl 4(%esi), %ecx
 ; X86-NEXT: movl 4(%eax), %edx
 ; X86-NEXT: bswapl %ecx
 ; X86-NEXT: bswapl %edx
 ; X86-NEXT: xorl %eax, %eax
 ; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: je .LBB13_3
-; X86-NEXT: .LBB13_2: # %res_block
+; X86-NEXT: jne .LBB13_1
+; X86-NEXT: # BB#3: # %endblock
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB13_1: # %res_block
 ; X86-NEXT: xorl %eax, %eax
 ; X86-NEXT: cmpl %edx, %ecx
 ; X86-NEXT: setae %al
 ; X86-NEXT: leal -1(%eax,%eax), %eax
-; X86-NEXT: .LBB13_3: # %endblock
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
 ;
@@ -429,21 +431,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
 
 define i1 @length8_eq(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length8_eq:
-; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86: # BB#0: # %loadbb
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: cmpl (%eax), %edx
-; X86-NEXT: jne .LBB14_2
-; X86-NEXT: # BB#1: # %loadbb1
-; X86-NEXT: movl 4(%ecx), %edx
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: cmpl 4(%eax), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl (%eax), %edx
+; X86-NEXT: cmpl (%ecx), %edx
+; X86-NEXT: jne .LBB14_1
+; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: movl 4(%eax), %edx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpl 4(%ecx), %edx
 ; X86-NEXT: je .LBB14_3
-; X86-NEXT: .LBB14_2: # %res_block
-; X86-NEXT: movl $1, %ecx
+; X86-NEXT: .LBB14_1: # %res_block
+; X86-NEXT: movl $1, %eax
 ; X86-NEXT: .LBB14_3: # %endblock
-; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: sete %al
 ; X86-NEXT: retl
 ;
@@ -460,15 +462,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind {
 
 define i1 @length8_eq_const(i8* %X) nounwind {
 ; X86-LABEL: length8_eq_const:
-; X86: # BB#0:
+; X86: # BB#0: # %loadbb
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130
-; X86-NEXT: jne .LBB15_2
-; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: jne .LBB15_1
+; X86-NEXT: # BB#2: # %loadbb1
 ; X86-NEXT: xorl %eax, %eax
 ; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534
 ; X86-NEXT: je .LBB15_3
-; X86-NEXT: .LBB15_2: # %res_block
+; X86-NEXT: .LBB15_1: # %res_block
 ; X86-NEXT: movl $1, %eax
 ; X86-NEXT: .LBB15_3: # %endblock
 ; X86-NEXT: testl %eax, %eax
@@ -500,16 +502,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind {
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length12_eq:
-; X64: # BB#0:
+; X64: # BB#0: # %loadbb
 ; X64-NEXT: movq (%rdi), %rax
 ; X64-NEXT: cmpq (%rsi), %rax
-; X64-NEXT: jne .LBB16_2
-; X64-NEXT: # BB#1: # %loadbb1
+; X64-NEXT: jne .LBB16_1
+; X64-NEXT: # BB#2: # %loadbb1
 ; X64-NEXT: movl 8(%rdi), %ecx
 ; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: cmpl 8(%rsi), %ecx
 ; X64-NEXT: je .LBB16_3
-; X64-NEXT: .LBB16_2: # %res_block
+; X64-NEXT: .LBB16_1: # %res_block
 ; X64-NEXT: movl $1, %eax
 ; X64-NEXT: .LBB16_3: # %endblock
 ; X64-NEXT: testl %eax, %eax
@@ -532,27 +534,28 @@ define i32 @length12(i8* %X, i8* %Y) nounwind {
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length12:
-; X64: # BB#0:
+; X64: # BB#0: # %loadbb
 ; X64-NEXT: movq (%rdi), %rcx
 ; X64-NEXT: movq (%rsi), %rdx
 ; X64-NEXT: bswapq %rcx
 ; X64-NEXT: bswapq %rdx
 ; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB17_2
-; X64-NEXT: # BB#1: # %loadbb1
+; X64-NEXT: jne .LBB17_1
+; X64-NEXT: # BB#2: # %loadbb1
 ; X64-NEXT: movl 8(%rdi), %ecx
 ; X64-NEXT: movl 8(%rsi), %edx
 ; X64-NEXT: bswapl %ecx
 ; X64-NEXT: bswapl %edx
 ; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: je .LBB17_3
-; X64-NEXT: .LBB17_2: # %res_block
+; X64-NEXT: jne .LBB17_1
+; X64-NEXT: # BB#3: # %endblock
+; X64-NEXT: retq
+; X64-NEXT: .LBB17_1: # %res_block
 ; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: cmpq %rdx, %rcx
 ; X64-NEXT: setae %al
 ; X64-NEXT: leal -1(%rax,%rax), %eax
-; X64-NEXT: .LBB17_3: # %endblock
 ; X64-NEXT: retq
 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
 ret i32 %m
@@ -572,27 +575,28 @@ define i32 @length16(i8* %X, i8* %Y) nounwind {
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length16:
-; X64: # BB#0:
+; X64: # BB#0: # %loadbb
 ; X64-NEXT: movq (%rdi), %rcx
 ; X64-NEXT: movq (%rsi), %rdx
 ; X64-NEXT: bswapq %rcx
 ; X64-NEXT: bswapq %rdx
 ; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB18_2
-; X64-NEXT: # BB#1: # %loadbb1
+; X64-NEXT: jne .LBB18_1
+; X64-NEXT: # BB#2: # %loadbb1
 ; X64-NEXT: movq 8(%rdi), %rcx
 ; X64-NEXT: movq 8(%rsi), %rdx
 ; X64-NEXT: bswapq %rcx
 ; X64-NEXT: bswapq %rdx
 ; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: je .LBB18_3
-; X64-NEXT: .LBB18_2: # %res_block
+; X64-NEXT: jne .LBB18_1
+; X64-NEXT: # BB#3: # %endblock
+; X64-NEXT: retq
+; X64-NEXT: .LBB18_1: # %res_block
 ; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: cmpq %rdx, %rcx
 ; X64-NEXT: setae %al
 ; X64-NEXT: leal -1(%rax,%rax), %eax
-; X64-NEXT: .LBB18_3: # %endblock
 ; X64-NEXT: retq
 %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
 ret i32 %m
@@ -750,19 +754,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind {
 ; X86-NEXT: retl
 ;
 ; X64-SSE2-LABEL: length24_eq:
-; X64-SSE2: # BB#0:
+; X64-SSE2: # BB#0: # %loadbb
 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB22_2
-; X64-SSE2-NEXT: # BB#1: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB22_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
 ; X64-SSE2-NEXT: movq 16(%rdi), %rcx
 ; X64-SSE2-NEXT: xorl %eax, %eax
 ; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
 ; X64-SSE2-NEXT: je .LBB22_3
-; X64-SSE2-NEXT: .LBB22_2: # %res_block
+; X64-SSE2-NEXT: .LBB22_1: # %res_block
 ; X64-SSE2-NEXT: movl $1, %eax
 ; X64-SSE2-NEXT: .LBB22_3: # %endblock
 ; X64-SSE2-NEXT: testl %eax, %eax
@@ -770,18 +774,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind {
 ; X64-SSE2-NEXT: retq
 ;
 ; X64-AVX-LABEL: length24_eq:
-; X64-AVX: # BB#0:
+; X64-AVX: # BB#0: # %loadbb
 ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
 ; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
 ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX-NEXT: jne .LBB22_2
-; X64-AVX-NEXT: # BB#1: # %loadbb1
+; X64-AVX-NEXT: jne .LBB22_1
+; X64-AVX-NEXT: # BB#2: # %loadbb1
 ; X64-AVX-NEXT: movq 16(%rdi), %rcx
 ; X64-AVX-NEXT: xorl %eax, %eax
 ; X64-AVX-NEXT: cmpq 16(%rsi), %rcx
 ; X64-AVX-NEXT: je .LBB22_3
-; X64-AVX-NEXT: .LBB22_2: # %res_block
+; X64-AVX-NEXT: .LBB22_1: # %res_block
 ; X64-AVX-NEXT: movl $1, %eax
 ; X64-AVX-NEXT: .LBB22_3: # %endblock
 ; X64-AVX-NEXT: testl %eax, %eax
@@ -806,18 +810,18 @@ define i1 @length24_eq_const(i8* %X) nounwind {
 ; X86-NEXT: retl
 ;
 ; X64-SSE2-LABEL: length24_eq_const:
-; X64-SSE2: # BB#0:
+; X64-SSE2: # BB#0: # %loadbb
 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB23_2
-; X64-SSE2-NEXT: # BB#1: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB23_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
 ; X64-SSE2-NEXT: xorl %eax, %eax
 ; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
 ; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
 ; X64-SSE2-NEXT: je .LBB23_3
-; X64-SSE2-NEXT: .LBB23_2: # %res_block
+; X64-SSE2-NEXT: .LBB23_1: # %res_block
 ; X64-SSE2-NEXT: movl $1, %eax
 ; X64-SSE2-NEXT: .LBB23_3: # %endblock
 ; X64-SSE2-NEXT: testl %eax, %eax
@@ -825,18 +829,18 @@ define i1 @length24_eq_const(i8* %X) nounwind {
 ; X64-SSE2-NEXT: retq
 ;
 ; X64-AVX-LABEL: length24_eq_const:
-; X64-AVX: # BB#0:
+; X64-AVX: # BB#0: # %loadbb
 ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
 ; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
 ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX-NEXT: jne .LBB23_2
-; X64-AVX-NEXT: # BB#1: # %loadbb1
+; X64-AVX-NEXT: jne .LBB23_1
+; X64-AVX-NEXT: # BB#2: # %loadbb1
 ; X64-AVX-NEXT: xorl %eax, %eax
 ; X64-AVX-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
 ; X64-AVX-NEXT: cmpq %rcx, 16(%rdi)
 ; X64-AVX-NEXT: je .LBB23_3
-; X64-AVX-NEXT: .LBB23_2: # %res_block
+; X64-AVX-NEXT: .LBB23_1: # %res_block
 ; X64-AVX-NEXT: movl $1, %eax
 ; X64-AVX-NEXT: .LBB23_3: # %endblock
 ; X64-AVX-NEXT: testl %eax, %eax
@@ -894,7 +898,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
 ; X86-SSE1-NEXT: retl
 ;
 ; X86-SSE2-LABEL: length32_eq:
-; X86-SSE2: # BB#0:
+; X86-SSE2: # BB#0: # %loadbb
 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
@@ -902,8 +906,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
 ; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
 ; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
-; X86-SSE2-NEXT: jne .LBB25_2
-; X86-SSE2-NEXT: # BB#1: # %loadbb1
+; X86-SSE2-NEXT: jne .LBB25_1
+; X86-SSE2-NEXT: # BB#2: # %loadbb1
 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -911,7 +915,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
 ; X86-SSE2-NEXT: xorl %eax, %eax
 ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
 ; X86-SSE2-NEXT: je .LBB25_3
-; X86-SSE2-NEXT: .LBB25_2: # %res_block
+; X86-SSE2-NEXT: .LBB25_1: # %res_block
 ; X86-SSE2-NEXT: movl $1, %eax
 ; X86-SSE2-NEXT: .LBB25_3: # %endblock
 ; X86-SSE2-NEXT: testl %eax, %eax
@@ -919,14 +923,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
 ; X86-SSE2-NEXT: retl
 ;
 ; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2: # BB#0:
+; X64-SSE2: # BB#0: # %loadbb
 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB25_2
-; X64-SSE2-NEXT: # BB#1: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB25_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -934,7 +938,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
 ; X64-SSE2-NEXT: xorl %eax, %eax
 ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
 ; X64-SSE2-NEXT: je .LBB25_3
-; X64-SSE2-NEXT: .LBB25_2: # %res_block
+; X64-SSE2-NEXT: .LBB25_1: # %res_block
 ; X64-SSE2-NEXT: movl $1, %eax
 ; X64-SSE2-NEXT: .LBB25_3: # %endblock
 ; X64-SSE2-NEXT: testl %eax, %eax
@@ -942,20 +946,20 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
 ; X64-SSE2-NEXT: retq
 ;
 ; X64-AVX1-LABEL: length32_eq:
-; X64-AVX1: # BB#0:
+; X64-AVX1: # BB#0: # %loadbb
 ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0
 ; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax
 ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX1-NEXT: jne .LBB25_2
-; X64-AVX1-NEXT: # BB#1: # %loadbb1
+; X64-AVX1-NEXT: jne .LBB25_1
+; X64-AVX1-NEXT: # BB#2: # %loadbb1
 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0
 ; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx
 ; X64-AVX1-NEXT: xorl %eax, %eax
 ; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
 ; X64-AVX1-NEXT: je .LBB25_3
-; X64-AVX1-NEXT: .LBB25_2: # %res_block
+; X64-AVX1-NEXT: .LBB25_1: # %res_block
 ; X64-AVX1-NEXT: movl $1, %eax
 ; X64-AVX1-NEXT: .LBB25_3: # %endblock
 ; X64-AVX1-NEXT: testl %eax, %eax
@@ -1002,21 +1006,21 @@ define i1 @length32_eq_const(i8* %X) nounwind {
 ; X86-SSE1-NEXT: retl
 ;
 ; X86-SSE2-LABEL: length32_eq_const:
-; X86-SSE2: # BB#0:
+; X86-SSE2: # BB#0: # %loadbb
 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT: movdqu (%eax), %xmm0
 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
 ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
-; X86-SSE2-NEXT: jne .LBB26_2
-; X86-SSE2-NEXT: # BB#1: # %loadbb1
+; X86-SSE2-NEXT: jne .LBB26_1
+; X86-SSE2-NEXT: # BB#2: # %loadbb1
 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
 ; X86-SSE2-NEXT: xorl %eax, %eax
 ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
 ; X86-SSE2-NEXT: je .LBB26_3
-; X86-SSE2-NEXT: .LBB26_2: # %res_block
+; X86-SSE2-NEXT: .LBB26_1: # %res_block
 ; X86-SSE2-NEXT: movl $1, %eax
 ; X86-SSE2-NEXT: .LBB26_3: # %endblock
 ; X86-SSE2-NEXT: testl %eax, %eax
@@ -1024,20 +1028,20 @@ define i1 @length32_eq_const(i8* %X) nounwind {
 ; X86-SSE2-NEXT: retl
 ;
 ; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2: # BB#0:
+; X64-SSE2: # BB#0: # %loadbb
 ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
 ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB26_2
-; X64-SSE2-NEXT: # BB#1: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB26_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
 ; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
 ; X64-SSE2-NEXT: xorl %eax, %eax
 ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
 ; X64-SSE2-NEXT: je .LBB26_3
-; X64-SSE2-NEXT: .LBB26_2: # %res_block
+; X64-SSE2-NEXT: .LBB26_1: # %res_block
 ; X64-SSE2-NEXT: movl $1, %eax
 ; X64-SSE2-NEXT: .LBB26_3: # %endblock
 ; X64-SSE2-NEXT: testl %eax, %eax
@@ -1045,20 +1049,20 @@ define i1 @length32_eq_const(i8* %X) nounwind {
 ; X64-SSE2-NEXT: retq
 ;
 ; X64-AVX1-LABEL: length32_eq_const:
-; X64-AVX1: # BB#0:
+; X64-AVX1: # BB#0: # %loadbb
 ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0
 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax
 ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX1-NEXT: jne .LBB26_2
-; X64-AVX1-NEXT: # BB#1: # %loadbb1
+; X64-AVX1-NEXT: jne .LBB26_1
+; X64-AVX1-NEXT: # BB#2: # %loadbb1
 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0
 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx
 ; X64-AVX1-NEXT: xorl %eax, %eax
 ; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
 ; X64-AVX1-NEXT: je .LBB26_3
-; X64-AVX1-NEXT: .LBB26_2: # %res_block
+; X64-AVX1-NEXT: .LBB26_1: # %res_block
 ; X64-AVX1-NEXT: movl $1, %eax
 ; X64-AVX1-NEXT: .LBB26_3: # %endblock
 ; X64-AVX1-NEXT: testl %eax, %eax
@@ -1132,20 +1136,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind {
 ; X64-AVX1-NEXT: retq
 ;
 ; X64-AVX2-LABEL: length64_eq:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # BB#0: # %loadbb
 ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
 ; X64-AVX2-NEXT: cmpl $-1, %eax
-; X64-AVX2-NEXT: jne .LBB28_2
-; X64-AVX2-NEXT: # BB#1: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB28_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
 ; X64-AVX2-NEXT: xorl %eax, %eax
 ; X64-AVX2-NEXT: cmpl $-1, %ecx
 ; X64-AVX2-NEXT: je .LBB28_3
-; X64-AVX2-NEXT: .LBB28_2: # %res_block
+; X64-AVX2-NEXT: .LBB28_1: # %res_block
 ; X64-AVX2-NEXT: movl $1, %eax
 ; X64-AVX2-NEXT: .LBB28_3: # %endblock
 ; X64-AVX2-NEXT: testl %eax, %eax
@@ -1193,20 +1197,20 @@ define i1 @length64_eq_const(i8* %X) nounwind {
 ; X64-AVX1-NEXT: retq
 ;
 ; X64-AVX2-LABEL: length64_eq_const:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # BB#0: # %loadbb
 ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
 ; X64-AVX2-NEXT: cmpl $-1, %eax
-; X64-AVX2-NEXT: jne .LBB29_2
-; X64-AVX2-NEXT: # BB#1: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB29_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
 ; X64-AVX2-NEXT: xorl %eax, %eax
 ; X64-AVX2-NEXT: cmpl $-1, %ecx
 ; X64-AVX2-NEXT: je .LBB29_3
-; X64-AVX2-NEXT: .LBB29_2: # %res_block
+; X64-AVX2-NEXT: .LBB29_1: # %res_block
 ; X64-AVX2-NEXT: movl $1, %eax
 ; X64-AVX2-NEXT: .LBB29_3: # %endblock
 ; X64-AVX2-NEXT: testl %eax, %eax
diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/CodeGenPrepare/X86/memcmp.ll
new file mode 100644
index 00000000000..a4f635c956d
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/X86/memcmp.ll
@@ -0,0 +1,771 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -codegenprepare -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+declare i32 @memcmp(i8* nocapture, i8* nocapture, i64)
+
+define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp2(
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16*
+; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]]
+; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; ALL-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; ALL-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT: [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
+; ALL-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; ALL-NEXT: ret i32 [[TMP9]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2)
+ ret i32 %call
+}
+
+define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp3(
+; ALL-NEXT: loadbb:
+; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16*
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16*
+; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]]
+; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; ALL-NEXT: [[TMP6:%.*]] = icmp eq i16 [[TMP4]], [[TMP5]]
+; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; ALL: res_block:
+; ALL-NEXT: [[TMP7:%.*]] = icmp ult i16 [[TMP4]], [[TMP5]]
+; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb1:
+; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 2
+; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 2
+; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]]
+; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]]
+; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
+; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+; ALL-NEXT: br label [[ENDBLOCK]]
+; ALL: endblock:
+; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; ALL-NEXT: ret i32 [[PHI_RES]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3)
+ ret i32 %call
+}
+
+define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp4(
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
+; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; ALL-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; ALL-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
+; ALL-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
+; ALL-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
+; ALL-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
+; ALL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; ALL-NEXT: ret i32 [[TMP11]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
+ ret i32 %call
+}
+
+define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp5(
+; ALL-NEXT: loadbb:
+; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
+; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
+; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; ALL: res_block:
+; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]]
+; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb1:
+; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 4
+; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 4
+; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]]
+; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]]
+; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
+; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+; ALL-NEXT: br label [[ENDBLOCK]]
+; ALL: endblock:
+; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; ALL-NEXT: ret i32 [[PHI_RES]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5)
+ ret i32 %call
+}
+
+define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp6(
+; ALL-NEXT: loadbb:
+; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
+; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
+; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; ALL: res_block:
+; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ]
+; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb1:
+; ALL-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16*
+; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16*
+; ALL-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2
+; ALL-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2
+; ALL-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]]
+; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]]
+; ALL-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]])
+; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
+; ALL-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32
+; ALL-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32
+; ALL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]]
+; ALL-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; ALL: endblock:
+; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; ALL-NEXT: ret i32 [[PHI_RES]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6)
+ ret i32 %call
+}
+
+define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp7(
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7)
+; ALL-NEXT: ret i32 [[CALL]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7)
+ ret i32 %call
+}
+
+define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; X32-LABEL: @cmp8(
+; X32-NEXT: loadbb:
+; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
+; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
+; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X32: res_block:
+; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ]
+; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X32-NEXT: br label [[ENDBLOCK:%.*]]
+; X32: loadbb1:
+; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32*
+; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32*
+; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1
+; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1
+; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]]
+; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]]
+; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]])
+; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
+; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]]
+; X32-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X32: endblock:
+; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X32-NEXT: ret i32 [[PHI_RES]]
+;
+; X64-LABEL: @cmp8(
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
+; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
+; X64-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
+; X64-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
+; X64-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
+; X64-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; X64-NEXT: ret i32 [[TMP11]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
+ ret i32 %call
+}
+
+define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; X32-LABEL: @cmp9(
+; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9)
+; X32-NEXT: ret i32 [[CALL]]
+;
+; X64-LABEL: @cmp9(
+; X64-NEXT: loadbb:
+; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
+; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64: res_block:
+; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]]
+; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb1:
+; X64-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 8
+; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 8
+; X64-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]]
+; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]]
+; X64-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
+; X64-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+; X64-NEXT: br label [[ENDBLOCK]]
+; X64: endblock:
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X64-NEXT: ret i32 [[PHI_RES]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9)
+ ret i32 %call
+}
+
+define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; X32-LABEL: @cmp10(
+; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10)
+; X32-NEXT: ret i32 [[CALL]]
+;
+; X64-LABEL: @cmp10(
+; X64-NEXT: loadbb:
+; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
+; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64: res_block:
+; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ]
+; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb1:
+; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16*
+; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16*
+; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4
+; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4
+; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]]
+; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]]
+; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]])
+; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
+; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64
+; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64
+; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]]
+; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64: endblock:
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X64-NEXT: ret i32 [[PHI_RES]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10)
+ ret i32 %call
+}
+
+define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp11(
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11)
+; ALL-NEXT: ret i32 [[CALL]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11)
+ ret i32 %call
+}
+
+define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; X32-LABEL: @cmp12(
+; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12)
+; X32-NEXT: ret i32 [[CALL]]
+;
+; X64-LABEL: @cmp12(
+; X64-NEXT: loadbb:
+; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
+; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64: res_block:
+; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ]
+; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb1:
+; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32*
+; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32*
+; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2
+; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2
+; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]]
+; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]]
+; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]])
+; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
+; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64
+; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64
+; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]]
+; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64: endblock:
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X64-NEXT: ret i32 [[PHI_RES]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12)
+ ret i32 %call
+}
+
+define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp13(
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13)
+; ALL-NEXT: ret i32 [[CALL]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13)
+ ret i32 %call
+}
+
+define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp14(
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14)
+; ALL-NEXT: ret i32 [[CALL]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14)
+ ret i32 %call
+}
+
+define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp15(
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15)
+; ALL-NEXT: ret i32 [[CALL]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15)
+ ret i32 %call
+}
+
+define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; X32-LABEL: @cmp16(
+; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16)
+; X32-NEXT: ret i32 [[CALL]]
+;
+; X64-LABEL: @cmp16(
+; X64-NEXT: loadbb:
+; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
+; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64: res_block:
+; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ]
+; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb1:
+; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i64*
+; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i64*
+; X64-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 1
+; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1
+; X64-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP11]]
+; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]]
+; X64-NEXT: [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]])
+; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
+; X64-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP15]], [[TMP16]]
+; X64-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64: endblock:
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X64-NEXT: ret i32 [[PHI_RES]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)
+ ret i32 %call
+}
+
+define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp_eq2(
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16*
+; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]]
+; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
+; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
+; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT: ret i32 [[CONV]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2)
+ %cmp = icmp eq i32 %call, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp_eq3(
+; ALL-NEXT: loadbb:
+; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16*
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16*
+; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]]
+; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = icmp ne i16 [[TMP2]], [[TMP3]]
+; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; ALL: res_block:
+; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb1:
+; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 2
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 2
+; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]]
+; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]]
+; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]]
+; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; ALL: endblock:
+; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
+; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT: ret i32 [[CONV]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3)
+ %cmp = icmp eq i32 %call, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp_eq4(
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
+; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
+; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
+; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT: ret i32 [[CONV]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
+ %cmp = icmp eq i32 %call, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp_eq5(
+; ALL-NEXT: loadbb:
+; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
+; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]]
+; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; ALL: res_block:
+; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb1:
+; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 4
+; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 4
+; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]]
+; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]]
+; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]]
+; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; ALL: endblock:
+; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
+; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT: ret i32 [[CONV]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5)
+ %cmp = icmp eq i32 %call, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp_eq6(
+; ALL-NEXT: loadbb:
+; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
+; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]]
+; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; ALL: res_block:
+; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb1:
+; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16*
+; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16*
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2
+; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2
+; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]]
+; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
+; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]]
+; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; ALL: endblock:
+; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
+; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT: ret i32 [[CONV]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6)
+ %cmp = icmp eq i32 %call, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp_eq7(
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7)
+; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT: ret i32 [[CONV]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7)
+ %cmp = icmp eq i32 %call, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; X32-LABEL: @cmp_eq8(
+; X32-NEXT: loadbb:
+; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
+; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]]
+; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X32: res_block:
+; X32-NEXT: br label [[ENDBLOCK:%.*]]
+; X32: loadbb1:
+; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32*
+; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32*
+; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1
+; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1
+; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]]
+; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
+; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]]
+; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X32: endblock:
+; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
+; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; X32-NEXT: ret i32 [[CONV]]
+;
+; X64-LABEL: @cmp_eq8(
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
+; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
+; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; X64-NEXT: ret i32 [[CONV]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
+ %cmp = icmp eq i32 %call, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; X32-LABEL: @cmp_eq9(
+; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9)
+; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; X32-NEXT: ret i32 [[CONV]]
+;
+; X64-LABEL: @cmp_eq9(
+; X64-NEXT: loadbb:
+; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
+; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64: res_block:
+; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb1:
+; X64-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 8
+; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 8
+; X64-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]]
+; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]]
+; X64-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]]
+; X64-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64: endblock:
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
+; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; X64-NEXT: ret i32 [[CONV]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9)
+ %cmp = icmp eq i32 %call, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; X32-LABEL: @cmp_eq10(
+; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10)
+; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; X32-NEXT: ret i32 [[CONV]]
+;
+; X64-LABEL: @cmp_eq10(
+; X64-NEXT: loadbb:
+; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
+; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64: res_block:
+; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb1:
+; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16*
+; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16*
+; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4
+; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4
+; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]]
+; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
+; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]]
+; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64: endblock:
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
+; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; X64-NEXT: ret i32 [[CONV]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10)
+ %cmp = icmp eq i32 %call, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp_eq11(
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11)
+; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT: ret i32 [[CONV]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11)
+ %cmp = icmp eq i32 %call, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; X32-LABEL: @cmp_eq12(
+; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12)
+; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; X32-NEXT: ret i32 [[CONV]]
+;
+; X64-LABEL: @cmp_eq12(
+; X64-NEXT: loadbb:
+; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
+; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64: res_block:
+; X64-NEXT: br label [[ENDBLOCK:%.*]]
+; X64: loadbb1:
+; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32*
+; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32*
+; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2
+; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2
+; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]]
+; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
+; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]]
+; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64: endblock:
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
+; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; X64-NEXT: ret i32 [[CONV]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12)
+ %cmp = icmp eq i32 %call, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp_eq13(
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13)
+; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT: ret i32 [[CONV]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13)
+ %cmp = icmp eq i32 %call, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp_eq14(
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14)
+; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT: ret i32 [[CONV]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14)
+ %cmp = icmp eq i32 %call, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp_eq15(
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15)
+; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT: ret i32 [[CONV]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15)
+ %cmp = icmp eq i32 %call, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; X32-LABEL: @cmp_eq16(
+; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16)
+; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; X32-NEXT: ret i32 [[CONV]]
+;
+; X64-LABEL: @cmp_eq16(
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i128*
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i128*
+; X64-NEXT: [[TMP3:%.*]] = load i128, i128* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = load i128, i128* [[TMP2]]
+; X64-NEXT: [[TMP5:%.*]] = icmp ne i128 [[TMP3]], [[TMP4]]
+; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; X64-NEXT: ret i32 [[CONV]]
+;
+ %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)
+ %cmp = icmp eq i32 %call, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
diff --git a/test/Transforms/ExpandMemCmp/X86/lit.local.cfg b/test/Transforms/ExpandMemCmp/X86/lit.local.cfg
deleted file mode 100644
index e71f3cc4c41..00000000000
--- a/test/Transforms/ExpandMemCmp/X86/lit.local.cfg
+++ /dev/null
@@ -1,3 +0,0 @@
-if not 'X86' in config.root.targets:
-    config.unsupported = True
-
diff --git a/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/test/Transforms/ExpandMemCmp/X86/memcmp.ll
deleted file mode 100644
index 1abfb20f369..00000000000
--- a/test/Transforms/ExpandMemCmp/X86/memcmp.ll
+++ /dev/null
@@ -1,792 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -expandmemcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32
-; RUN: opt -S -expandmemcmp -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64
-
-declare i32 @memcmp(i8* nocapture, i8* nocapture, i64)
-
-define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; ALL-LABEL: @cmp2(
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16*
-; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16*
-; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
-; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]]
-; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; ALL-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; ALL-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; ALL-NEXT: [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; ALL-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; ALL-NEXT: ret i32 [[TMP9]]
-;
- %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2)
- ret i32 %call
-}
-
-define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; ALL-LABEL: @cmp3(
-; ALL-NEXT: br label [[LOADBB:%.*]]
-; ALL: res_block:
-; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i16 [ [[TMP7:%.*]], [[LOADBB]] ]
-; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i16 [ [[TMP8:%.*]], [[LOADBB]] ]
-; ALL-NEXT: [[TMP1:%.*]] = icmp ult i16 [[PHI_SRC1]], [[PHI_SRC2]]
-; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
-; ALL-NEXT: br label [[ENDBLOCK:%.*]]
-; ALL: loadbb:
-; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i16*
-; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i16*
-; ALL-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]]
-; ALL-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP4]]
-; ALL-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
-; ALL-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
-; ALL-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
-; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
-; ALL: loadbb1:
-; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 2
-; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 2
-; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]]
-; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]]
-; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
-; ALL-NEXT: br label [[ENDBLOCK]]
-; ALL: endblock:
-; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
-; ALL-NEXT: ret i32 [[PHI_RES]]
-;
- %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3)
- ret i32 %call
-}
-
-define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; ALL-LABEL: @cmp4(
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
-; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
-; ALL-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; ALL-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
-; ALL-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; ALL-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; ALL-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; ALL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
-; ALL-NEXT: ret i32 [[TMP11]]
-;
- %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
- ret i32 %call
-}
-
-define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; ALL-LABEL: @cmp5(
-; ALL-NEXT: br label [[LOADBB:%.*]]
-; ALL: res_block:
-; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ]
-; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ]
-; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
-; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
-; ALL-NEXT: br label [[ENDBLOCK:%.*]]
-; ALL: loadbb:
-; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]]
-; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]]
-; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
-; ALL: loadbb1:
-; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 4
-; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 4
-; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]]
-; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]]
-; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
-; ALL-NEXT: br label [[ENDBLOCK]]
-; ALL: endblock:
-; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
-; ALL-NEXT: ret i32 [[PHI_RES]]
-;
- %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5)
- ret i32 %call
-}
-
-define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; ALL-LABEL: @cmp6(
-; ALL-NEXT: br label [[LOADBB:%.*]]
-; ALL: res_block:
-; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
-; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
-; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
-; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
-; ALL-NEXT: br label [[ENDBLOCK:%.*]]
-; ALL: loadbb:
-; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]]
-; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]]
-; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
-; ALL: loadbb1:
-; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16*
-; ALL-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16*
-; ALL-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2
-; ALL-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2
-; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]]
-; ALL-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]]
-; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; ALL-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
-; ALL-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32
-; ALL-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i32
-; ALL-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]]
-; ALL-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
-; ALL: endblock:
-; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
-; ALL-NEXT: ret i32 [[PHI_RES]]
-;
- %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6)
- ret i32 %call
-}
-
-define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; ALL-LABEL: @cmp7(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7)
-; ALL-NEXT: ret i32 [[CALL]]
-;
- %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7)
- ret i32 %call
-}
-
-define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; X32-LABEL: @cmp8(
-; X32-NEXT: br label [[LOADBB:%.*]]
-; X32: res_block:
-; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
-; X32-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
-; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
-; X32-NEXT: br label [[ENDBLOCK:%.*]]
-; X32: loadbb:
-; X32-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; X32-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; X32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]]
-; X32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]]
-; X32-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X32-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X32-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
-; X32: loadbb1:
-; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32*
-; X32-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32*
-; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1
-; X32-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
-; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]]
-; X32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]]
-; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X32-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
-; X32-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]]
-; X32-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
-; X32: endblock:
-; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
-; X32-NEXT: ret i32 [[PHI_RES]]
-;
-; X64-LABEL: @cmp8(
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
-; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
-; X64-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
-; X64-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
-; X64-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X64-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X64-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
-; X64-NEXT: ret i32 [[TMP11]]
-;
- %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
- ret i32 %call
-}
-
-define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; X32-LABEL: @cmp9(
-; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9)
-; X32-NEXT: ret i32 [[CALL]]
-;
-; X64-LABEL: @cmp9(
-; X64-NEXT: br label [[LOADBB:%.*]]
-; X64: res_block:
-; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ]
-; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ]
-; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
-; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
-; X64-NEXT: br label [[ENDBLOCK:%.*]]
-; X64: loadbb:
-; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]]
-; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]]
-; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
-; X64: loadbb1:
-; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8
-; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 8
-; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]]
-; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]]
-; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; X64-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
-; X64-NEXT: br label [[ENDBLOCK]]
-; X64: endblock:
-; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
-; X64-NEXT: ret i32 [[PHI_RES]]
-;
- %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9)
- ret i32 %call
-}
-
-define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; X32-LABEL: @cmp10(
-; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10)
-; X32-NEXT: ret i32 [[CALL]]
-;
-; X64-LABEL: @cmp10(
-; X64-NEXT: br label [[LOADBB:%.*]]
-; X64: res_block:
-; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
-; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
-; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
-; X64-NEXT: br label [[ENDBLOCK:%.*]]
-; X64: loadbb:
-; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]]
-; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]]
-; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
-; X64: loadbb1:
-; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16*
-; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16*
-; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4
-; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 4
-; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]]
-; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]]
-; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
-; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64
-; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64
-; X64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]]
-; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
-; X64: endblock:
-; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
-; X64-NEXT: ret i32 [[PHI_RES]]
-;
- %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10)
- ret i32 %call
-}
-
-define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture
readonly %y) { -; ALL-LABEL: @cmp11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; ALL-NEXT: ret i32 [[CALL]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) - ret i32 %call -} - -define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp12( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; X32-NEXT: ret i32 [[CALL]] -; -; X64-LABEL: @cmp12( -; X64-NEXT: br label [[LOADBB:%.*]] -; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb: -; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] -; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] -; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) -; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) -; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] -; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32* -; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32* -; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 -; X64-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 2 -; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]] -; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) -; X64-NEXT: [[TMP17:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) -; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 -; X64-NEXT: [[TMP19]] = zext i32 [[TMP17]] to i64 -; X64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]] -; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] -; X64-NEXT: ret i32 [[PHI_RES]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) - ret i32 %call -} - -define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: ret i32 [[CALL]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) - ret i32 %call -} - -define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: ret i32 [[CALL]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) - ret i32 %call -} - -define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: ret i32 [[CALL]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) - ret i32 %call -} - -define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp16( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; X32-NEXT: ret i32 [[CALL]] -; -; X64-LABEL: @cmp16( -; X64-NEXT: br label [[LOADBB:%.*]] -; X64: res_block: -; 
X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb: -; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] -; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] -; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) -; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) -; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] -; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i64* -; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i64* -; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1 -; X64-NEXT: [[TMP13:%.*]] = getelementptr i64, i64* [[TMP11]], i64 1 -; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]] -; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) -; X64-NEXT: [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]]) -; X64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]] -; X64-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] -; X64-NEXT: ret i32 [[PHI_RES]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) - ret i32 %call -} - -define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq2( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] -; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq3( -; ALL-NEXT: br label [[LOADBB:%.*]] -; ALL: res_block: -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb: -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] -; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 2 -; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 2 -; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] -; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] -; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 
[[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq4( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] -; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq5( -; ALL-NEXT: br label [[LOADBB:%.*]] -; ALL: res_block: -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb: -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] -; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 4 -; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] -; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] -; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq6( -; ALL-NEXT: br label [[LOADBB:%.*]] -; ALL: res_block: -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb: -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] -; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* -; ALL-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* -; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 -; ALL-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2 -; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] -; ALL-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] -; ALL-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] -; ALL-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) - 
%cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq7( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq8( -; X32-NEXT: br label [[LOADBB:%.*]] -; X32: res_block: -; X32-NEXT: br label [[ENDBLOCK:%.*]] -; X32: loadbb: -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; X32-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] -; X32-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X32: loadbb1: -; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* -; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* -; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 -; X32-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 1 -; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] -; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] -; X32-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] -; X32-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X32: endblock: -; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64-LABEL: @cmp_eq8( -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] -; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq9( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64-LABEL: @cmp_eq9( -; X64-NEXT: br label [[LOADBB:%.*]] -; X64: res_block: -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb: -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] -; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; X64-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] -; X64-NEXT: [[TMP10:%.*]] = icmp 
ne i8 [[TMP8]], [[TMP9]] -; X64-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq10( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64-LABEL: @cmp_eq10( -; X64-NEXT: br label [[LOADBB:%.*]] -; X64: res_block: -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb: -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] -; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* -; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* -; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 -; X64-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4 -; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] -; X64-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] -; X64-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] -; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq12( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64-LABEL: @cmp_eq12( -; X64-NEXT: br label [[LOADBB:%.*]] -; X64: res_block: -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb: -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] -; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* -; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] 
to i32* -; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 -; X64-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2 -; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] -; X64-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] -; X64-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] -; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq16( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64-LABEL: @cmp_eq16( -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i128* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i128* -; X64-NEXT: [[TMP3:%.*]] = load i128, i128* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = load i128, i128* [[TMP2]] -; X64-NEXT: [[TMP5:%.*]] = icmp ne i128 [[TMP3]], [[TMP4]] -; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -- cgit v1.2.3 From 2880b72d32a55c01c386ebf5eca64df58ec32dae Mon Sep 17 00:00:00 2001 From: Anna Thomas Date: Thu, 2 Nov 2017 16:23:31 +0000 Subject: [RS4GC] Strip off invariant.start because memory locations arent invariant Summary: Invariant.start on memory locations has the property that the memory location is unchanging. However, this is not true in the face of rewriting statepoints for GC. 
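In isolation, the stripping boils down to the following minimal C++ sketch (the free function stripInvariantStarts is illustrative only and not part of this patch; the actual change below folds the same logic into the pass and additionally erases dead users such as invariant.end before the RAUW):

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/InstIterator.h"
  #include "llvm/IR/IntrinsicInst.h"

  using namespace llvm;

  // Illustrative sketch: collect every llvm.invariant.start in a function,
  // then erase it, replacing its {}* result with undef so that any remaining
  // users (e.g. llvm.invariant.end) still reference structurally valid IR.
  static void stripInvariantStarts(Function &F) {
    // Collect first; erasing while iterating would invalidate the iterator.
    SmallVector<IntrinsicInst *, 12> Starts;
    for (Instruction &I : instructions(F))
      if (auto *II = dyn_cast<IntrinsicInst>(&I))
        if (II->getIntrinsicID() == Intrinsic::invariant_start)
          Starts.push_back(II);

    for (IntrinsicInst *II : Starts) {
      II->replaceAllUsesWith(UndefValue::get(II->getType()));
      II->eraseFromParent();
    }
  }

Replacing the intrinsic's {}* result with undef, rather than deleting its remaining users outright, keeps the IR structurally valid when an invariant.end or an escaped use of the value is still present.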
Teach RS4GC about removing invariant.start so that optimizations after RS4GC do not incorrectly sink a load from the memory location past a statepoint. Added test showcasing the issue. Reviewers: reames, apilipenko, dneilson Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D39388 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317215 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 48 ++++++++++++++++---- .../drop-invalid-metadata.ll | 53 ++++++++++++++++++++++ 2 files changed, 92 insertions(+), 9 deletions(-) diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 1ca77cfec32..9a064829dee 100644 --- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -125,10 +125,10 @@ struct RewriteStatepointsForGC : public ModulePass { Changed |= runOnFunction(F); if (Changed) { - // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn + // stripNonValidData asserts that shouldRewriteStatepointsIn // returns true for at least one function in the module. Since at least // one function changed, we know that the precondition is satisfied. - stripNonValidAttributesAndMetadata(M); + stripNonValidData(M); } return Changed; @@ -146,15 +146,17 @@ struct RewriteStatepointsForGC : public ModulePass { /// metadata implying dereferenceability that are no longer valid/correct after /// RewriteStatepointsForGC has run. This is because semantically, after /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire - /// heap. stripNonValidAttributesAndMetadata (conservatively) restores + /// heap. stripNonValidData (conservatively) restores /// correctness by erasing all attributes in the module that externally imply /// dereferenceability. Similar reasoning also applies to the noalias /// attributes and metadata. gc.statepoint can touch the entire heap including /// noalias objects. - void stripNonValidAttributesAndMetadata(Module &M); + /// Apart from attributes and metadata, we also remove instructions that imply + /// constant physical memory: llvm.invariant.start. + void stripNonValidData(Module &M); - // Helpers for stripNonValidAttributesAndMetadata - void stripNonValidAttributesAndMetadataFromBody(Function &F); + // Helpers for stripNonValidData + void stripNonValidDataFromBody(Function &F); void stripNonValidAttributesFromPrototype(Function &F); // Certain metadata on instructions are invalid after running RS4GC. @@ -2385,14 +2387,30 @@ void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC); } -void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) { +void RewriteStatepointsForGC::stripNonValidDataFromBody(Function &F) { if (F.empty()) return; LLVMContext &Ctx = F.getContext(); MDBuilder Builder(Ctx); + // Set of invariant.start instructions that we need to remove. + // Use this to avoid invalidating the instruction iterator. + SmallVector<IntrinsicInst *, 12> InvariantStartInstructions; + for (Instruction &I : instructions(F)) { + // invariant.start on memory location implies that the referenced memory + // location is constant and unchanging.
This is no longer true after + // RewriteStatepointsForGC runs because there can be calls to gc.statepoint + // which frees the entire heap and the presence of invariant.start allows + // the optimizer to sink the load of a memory location past a statepoint, + // which is incorrect. + if (auto *II = dyn_cast<IntrinsicInst>(&I)) + if (II->getIntrinsicID() == Intrinsic::invariant_start) { + InvariantStartInstructions.push_back(II); + continue; + } + if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) { assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!"); bool IsImmutableTBAA = @@ -2422,6 +2440,18 @@ void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Functio RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex); } } + + // Delete the invariant.start instructions and any corresponding uses that + // don't have further uses, for example invariant.end. + for (auto *II : InvariantStartInstructions) { + for (auto *U : II->users()) + if (auto *I = dyn_cast<Instruction>(U)) + if (U->hasNUses(0)) + I->eraseFromParent(); + // We cannot just delete the remaining uses of II, so we RAUW undef. + II->replaceAllUsesWith(UndefValue::get(II->getType())); + II->eraseFromParent(); + } } /// Returns true if this function should be rewritten by this pass. The main @@ -2438,7 +2468,7 @@ static bool shouldRewriteStatepointsIn(Function &F) { return false; } -void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) { +void RewriteStatepointsForGC::stripNonValidData(Module &M) { #ifndef NDEBUG assert(llvm::any_of(M, shouldRewriteStatepointsIn) && "precondition!"); #endif @@ -2447,7 +2477,7 @@ void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) { stripNonValidAttributesFromPrototype(F); for (Function &F : M) - stripNonValidAttributesAndMetadataFromBody(F); + stripNonValidDataFromBody(F); } bool RewriteStatepointsForGC::runOnFunction(Function &F) { diff --git a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll index 105afa9def5..4f3ab6a4beb 100644 --- a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll +++ b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll @@ -75,6 +75,59 @@ define void @test_dereferenceable(i32 addrspace(1)* addrspace(1)* %p, i32 %x, i3 ret void } +; invariant.start allows us to sink the load past the baz statepoint call into taken block, which is +; incorrect. remove the invariant.start and RAUW undef. +define void @test_inv_start(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" { +; CHECK-LABEL: test_inv_start +; CHECK-NOT: invariant.start +; CHECK: gc.statepoint + %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p + %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1) + %v2 = load i32, i32 addrspace(1)* %v1 + call void @baz(i32 %x) + br i1 %cond, label %taken, label %untaken + +; CHECK-LABEL: taken: +; CHECK-NOT: llvm.invariant.end +taken: + store i32 %v2, i32 addrspace(1)* %q, align 16 + call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1) + ret void + +; CHECK-LABEL: untaken: +; CHECK: gc.statepoint +untaken: + %foo = call i32 @escaping.invariant.start({}* %invst) + call void @dummy(i32 %foo) + ret void +} + +; invariant.start and end is removed. No other uses.
+define void @test_inv_start2(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" { +; CHECK-LABEL: test_inv_start2 +; CHECK-NOT: invariant.start +; CHECK: gc.statepoint + %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p + %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1) + %v2 = load i32, i32 addrspace(1)* %v1 + call void @baz(i32 %x) + br i1 %cond, label %taken, label %untaken + +; CHECK-LABEL: taken: +; CHECK-NOT: llvm.invariant.end +taken: + store i32 %v2, i32 addrspace(1)* %q, align 16 + call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1) + ret void + +; CHECK-LABEL: untaken: +untaken: + ret void +} +declare {}* @llvm.invariant.start.p1i32(i64, i32 addrspace(1)* nocapture) nounwind readonly +declare void @llvm.invariant.end.p1i32({}*, i64, i32 addrspace(1)* nocapture) nounwind +declare i32 @escaping.invariant.start({}*) nounwind +declare void @dummy(i32) declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64, i32, void (i32)*, i32, i32, ...) ; Function Attrs: nounwind readonly -- cgit v1.2.3 From 685fd434908418296567408861b455e61af41ae7 Mon Sep 17 00:00:00 2001 From: Anna Thomas Date: Thu, 2 Nov 2017 16:45:51 +0000 Subject: Revert "[RS4GC] Strip off invariant.start because memory locations arent invariant" This reverts commit r317215, investigating the test failure. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317217 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 48 ++++---------------- .../drop-invalid-metadata.ll | 53 ---------------------- 2 files changed, 9 insertions(+), 92 deletions(-) diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 9a064829dee..1ca77cfec32 100644 --- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -125,10 +125,10 @@ struct RewriteStatepointsForGC : public ModulePass { Changed |= runOnFunction(F); if (Changed) { - // stripNonValidData asserts that shouldRewriteStatepointsIn + // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn // returns true for at least one function in the module. Since at least // one function changed, we know that the precondition is satisfied. - stripNonValidData(M); + stripNonValidAttributesAndMetadata(M); } return Changed; @@ -146,17 +146,15 @@ struct RewriteStatepointsForGC : public ModulePass { /// metadata implying dereferenceability that are no longer valid/correct after /// RewriteStatepointsForGC has run. This is because semantically, after /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire - /// heap. stripNonValidData (conservatively) restores + /// heap. stripNonValidAttributesAndMetadata (conservatively) restores /// correctness by erasing all attributes in the module that externally imply /// dereferenceability. Similar reasoning also applies to the noalias /// attributes and metadata. gc.statepoint can touch the entire heap including /// noalias objects. - /// Apart from attributes and metadata, we also remove instructions that imply - /// constant physical memory: llvm.invariant.start. 
- void stripNonValidData(Module &M); + void stripNonValidAttributesAndMetadata(Module &M); - // Helpers for stripNonValidData - void stripNonValidDataFromBody(Function &F); + // Helpers for stripNonValidAttributesAndMetadata + void stripNonValidAttributesAndMetadataFromBody(Function &F); void stripNonValidAttributesFromPrototype(Function &F); // Certain metadata on instructions are invalid after running RS4GC. @@ -2387,30 +2385,14 @@ void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC); } -void RewriteStatepointsForGC::stripNonValidDataFromBody(Function &F) { +void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) { if (F.empty()) return; LLVMContext &Ctx = F.getContext(); MDBuilder Builder(Ctx); - // Set of invariant.start instructions that we need to remove. - // Use this to avoid invalidating the instruction iterator. - SmallVector<IntrinsicInst *, 12> InvariantStartInstructions; - for (Instruction &I : instructions(F)) { - // invariant.start on memory location implies that the referenced memory - // location is constant and unchanging. This is no longer true after - // RewriteStatepointsForGC runs because there can be calls to gc.statepoint - // which frees the entire heap and the presence of invariant.start allows - // the optimizer to sink the load of a memory location past a statepoint, - // which is incorrect. - if (auto *II = dyn_cast<IntrinsicInst>(&I)) - if (II->getIntrinsicID() == Intrinsic::invariant_start) { - InvariantStartInstructions.push_back(II); - continue; - } - if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) { assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!"); bool IsImmutableTBAA = @@ -2440,18 +2422,6 @@ void RewriteStatepointsForGC::stripNonValidDataFromBody(Function &F) { RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex); } } - - // Delete the invariant.start instructions and any corresponding uses that - // don't have further uses, for example invariant.end. - for (auto *II : InvariantStartInstructions) { - for (auto *U : II->users()) - if (auto *I = dyn_cast<Instruction>(U)) - if (U->hasNUses(0)) - I->eraseFromParent(); - // We cannot just delete the remaining uses of II, so we RAUW undef. - II->replaceAllUsesWith(UndefValue::get(II->getType())); - II->eraseFromParent(); - } } /// Returns true if this function should be rewritten by this pass.
The main @@ -2468,7 +2438,7 @@ static bool shouldRewriteStatepointsIn(Function &F) { return false; } -void RewriteStatepointsForGC::stripNonValidData(Module &M) { +void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) { #ifndef NDEBUG assert(llvm::any_of(M, shouldRewriteStatepointsIn) && "precondition!"); #endif @@ -2477,7 +2447,7 @@ void RewriteStatepointsForGC::stripNonValidData(Module &M) { stripNonValidAttributesFromPrototype(F); for (Function &F : M) - stripNonValidDataFromBody(F); + stripNonValidAttributesAndMetadataFromBody(F); } bool RewriteStatepointsForGC::runOnFunction(Function &F) { diff --git a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll index 4f3ab6a4beb..105afa9def5 100644 --- a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll +++ b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll @@ -75,59 +75,6 @@ define void @test_dereferenceable(i32 addrspace(1)* addrspace(1)* %p, i32 %x, i3 ret void } -; invariant.start allows us to sink the load past the baz statepoint call into taken block, which is -; incorrect. remove the invariant.start and RAUW undef. -define void @test_inv_start(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" { -; CHECK-LABEL: test_inv_start -; CHECK-NOT: invariant.start -; CHECK: gc.statepoint - %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p - %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1) - %v2 = load i32, i32 addrspace(1)* %v1 - call void @baz(i32 %x) - br i1 %cond, label %taken, label %untaken - -; CHECK-LABEL: taken: -; CHECK-NOT: llvm.invariant.end -taken: - store i32 %v2, i32 addrspace(1)* %q, align 16 - call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1) - ret void - -; CHECK-LABEL: untaken: -; CHECK: gc.statepoint -untaken: - %foo = call i32 @escaping.invariant.start({}* %invst) - call void @dummy(i32 %foo) - ret void -} - -; invariant.start and end is removed. No other uses. -define void @test_inv_start2(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" { -; CHECK-LABEL: test_inv_start2 -; CHECK-NOT: invariant.start -; CHECK: gc.statepoint - %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p - %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1) - %v2 = load i32, i32 addrspace(1)* %v1 - call void @baz(i32 %x) - br i1 %cond, label %taken, label %untaken - -; CHECK-LABEL: taken: -; CHECK-NOT: llvm.invariant.end -taken: - store i32 %v2, i32 addrspace(1)* %q, align 16 - call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1) - ret void - -; CHECK-LABEL: untaken: -untaken: - ret void -} -declare {}* @llvm.invariant.start.p1i32(i64, i32 addrspace(1)* nocapture) nounwind readonly -declare void @llvm.invariant.end.p1i32({}*, i64, i32 addrspace(1)* nocapture) nounwind -declare i32 @escaping.invariant.start({}*) nounwind -declare void @dummy(i32) declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64, i32, void (i32)*, i32, i32, ...) 
; Function Attrs: nounwind readonly -- cgit v1.2.3 From 9cca1f183a2e5d6b131cc2fabd246f8908021902 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 2 Nov 2017 17:12:34 +0000 Subject: [dsymutil] Add a manpage for dsymutil git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317221 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/CMakeLists.txt | 11 ++--- docs/CommandGuide/index.rst | 1 + docs/CommandGuide/llvm-dsymutil.rst | 86 ++++++++++++++++++++++++++++++++++++ docs/CommandGuide/llvm-dwarfdump.rst | 2 +- 4 files changed, 94 insertions(+), 6 deletions(-) create mode 100644 docs/CommandGuide/llvm-dsymutil.rst diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index f1f93c7a228..6e430459e5d 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -3,7 +3,7 @@ if (DOXYGEN_FOUND) if (LLVM_ENABLE_DOXYGEN) set(abs_top_srcdir ${CMAKE_CURRENT_SOURCE_DIR}) set(abs_top_builddir ${CMAKE_CURRENT_BINARY_DIR}) - + if (HAVE_DOT) set(DOT ${LLVM_PATH_DOT}) endif() @@ -21,20 +21,20 @@ if (LLVM_ENABLE_DOXYGEN) set(enable_external_search "NO") set(extra_search_mappings "") endif() - + # If asked, configure doxygen for the creation of a Qt Compressed Help file. option(LLVM_ENABLE_DOXYGEN_QT_HELP "Generate a Qt Compressed Help file." OFF) if (LLVM_ENABLE_DOXYGEN_QT_HELP) set(LLVM_DOXYGEN_QCH_FILENAME "org.llvm.qch" CACHE STRING "Filename of the Qt Compressed help file") - set(LLVM_DOXYGEN_QHP_NAMESPACE "org.llvm" CACHE STRING + set(LLVM_DOXYGEN_QHP_NAMESPACE "org.llvm" CACHE STRING "Namespace under which the intermediate Qt Help Project file lives") set(LLVM_DOXYGEN_QHP_CUST_FILTER_NAME "${PACKAGE_STRING}" CACHE STRING "See http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-filters") set(LLVM_DOXYGEN_QHP_CUST_FILTER_ATTRS "${PACKAGE_NAME},${PACKAGE_VERSION}" CACHE STRING "See http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes") - find_program(LLVM_DOXYGEN_QHELPGENERATOR_PATH qhelpgenerator + find_program(LLVM_DOXYGEN_QHELPGENERATOR_PATH qhelpgenerator DOC "Path to the qhelpgenerator binary") if (NOT LLVM_DOXYGEN_QHELPGENERATOR_PATH) message(FATAL_ERROR "Failed to find qhelpgenerator binary") @@ -55,7 +55,7 @@ if (LLVM_ENABLE_DOXYGEN) set(llvm_doxygen_qhp_cust_filter_name "") set(llvm_doxygen_qhp_cust_filter_attrs "") endif() - + option(LLVM_DOXYGEN_SVG "Use svg instead of png files for doxygen graphs." OFF) if (LLVM_DOXYGEN_SVG) @@ -113,6 +113,7 @@ if (LLVM_ENABLE_SPHINX) if (${SPHINX_OUTPUT_MAN}) add_sphinx_target(man llvm) add_sphinx_target(man llvm-dwarfdump) + add_sphinx_target(man llvm-dsymutil) endif() endif() diff --git a/docs/CommandGuide/index.rst b/docs/CommandGuide/index.rst index 5a0a98ceb1f..a706ba1d675 100644 --- a/docs/CommandGuide/index.rst +++ b/docs/CommandGuide/index.rst @@ -30,6 +30,7 @@ Basic Commands llvm-stress llvm-symbolizer llvm-dwarfdump + llvm-dsymutil Debugging Tools ~~~~~~~~~~~~~~~ diff --git a/docs/CommandGuide/llvm-dsymutil.rst b/docs/CommandGuide/llvm-dsymutil.rst new file mode 100644 index 00000000000..19340e194b8 --- /dev/null +++ b/docs/CommandGuide/llvm-dsymutil.rst @@ -0,0 +1,86 @@ +llvm-dsymutil - manipulate archived DWARF debug symbol files +============================================================ + +SYNOPSIS +-------- + +:program:`llvm-dsymutil` [*options*] [*filename*] + +DESCRIPTION +----------- + +:program:`llvm-dsymutil` links the DWARF debug information found in the object +files for the executable input file by using debug symbols information +contained in its symbol table. + +OPTIONS +------- +.. 
option:: -arch=<arch> + + Link DWARF debug information only for specified CPU architecture + types. This option can be specified multiple times, once for each + desired architecture. All cpu architectures will be linked by + default. + +.. option:: -dump-debug-map + + Parse and dump the debug map to standard output. No DWARF link + will take place. + +.. option:: -f, -flat + + Produce a flat dSYM file (not a bundle). + +.. option:: -no-odr + + Do not use ODR (One Definition Rule) for type uniquing. + +.. option:: -no-output + + Do the link in memory, but do not emit the result file. + +.. option:: -no-swiftmodule-timestamp + + Don't check timestamp for swiftmodule files. + +.. option:: -j <n>, -num-threads=<n> + + Specifies the maximum number (n) of simultaneous threads to use + when linking multiple architectures. + +.. option:: -o=<filename> + + Specify the output file. default: <inputfile>.dwarf + +.. option:: -oso-prepend-path=<path> + + Specify a directory to prepend to the paths of object files. + +.. option:: -s, -symtab + + Dumps the symbol table found in executable or object file(s) and + exits. + +.. option:: -v, -verbose + + Verbosity level + +.. option:: --version + + Display the version of the tool. + +.. option:: -y + + Treat the input file as a YAML debug map rather than a binary. + + +EXIT STATUS +----------- + +:program:`llvm-dsymutil` returns 0 if the DWARF debug information was linked +successfully. Otherwise, it returns 1. + +SEE ALSO +-------- + +:manpage:`llvm-dwarfdump(1)` diff --git a/docs/CommandGuide/llvm-dwarfdump.rst b/docs/CommandGuide/llvm-dwarfdump.rst index a3b62664cbe..4e7791573e6 100644 --- a/docs/CommandGuide/llvm-dwarfdump.rst +++ b/docs/CommandGuide/llvm-dwarfdump.rst @@ -139,4 +139,4 @@ successfully. Otherwise, it returns 1. SEE ALSO -------- -:manpage:`dsymutil(1)` +:manpage:`llvm-dsymutil(1)` -- cgit v1.2.3 From 0416327f19718d0834f85aa644a2572f67a94acb Mon Sep 17 00:00:00 2001 From: Chad Rosier Date: Thu, 2 Nov 2017 17:52:27 +0000 Subject: [TargetParser][AArch64] Reorder enum to preserve 5.0.0 libLLVM ABI. This is required for backporting r311659 to the 5.0.1 release. PR35060 Differential Revision: https://reviews.llvm.org/D39558 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317222 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Support/TargetParser.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/llvm/Support/TargetParser.h b/include/llvm/Support/TargetParser.h index 6b56a635ff0..b3f91433bd9 100644 --- a/include/llvm/Support/TargetParser.h +++ b/include/llvm/Support/TargetParser.h @@ -167,10 +167,10 @@ enum ArchExtKind : unsigned { AEK_PROFILE = 1 << 6, AEK_RAS = 1 << 7, AEK_LSE = 1 << 8, - AEK_RDM = 1 << 9, - AEK_SVE = 1 << 10, - AEK_DOTPROD = 1 << 11, - AEK_RCPC = 1 << 12 + AEK_SVE = 1 << 9, + AEK_DOTPROD = 1 << 10, + AEK_RCPC = 1 << 11, + AEK_RDM = 1 << 12 }; StringRef getCanonicalArchName(StringRef Arch); -- cgit v1.2.3 From 4c88213d82fbff2542c5aaa8ffb7b0d93c66b6cb Mon Sep 17 00:00:00 2001 From: Mitch Phillips Date: Thu, 2 Nov 2017 18:04:44 +0000 Subject: Fixed line length style issue.
Reviewers: zturner Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D39395 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317223 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Support/MemoryBuffer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h index 73f0251a6b6..59c93f15d7b 100644 --- a/include/llvm/Support/MemoryBuffer.h +++ b/include/llvm/Support/MemoryBuffer.h @@ -136,7 +136,8 @@ public: /// Map a subrange of the specified file as a MemoryBuffer. static ErrorOr<std::unique_ptr<MemoryBuffer>> - getFileSlice(const Twine &Filename, uint64_t MapSize, uint64_t Offset, bool IsVolatile = false); + getFileSlice(const Twine &Filename, uint64_t MapSize, uint64_t Offset, + bool IsVolatile = false); //===--------------------------------------------------------------------===// // Provided for performance analysis. -- cgit v1.2.3 From 0c059eff813ce99b6882cc6812a2770a2f45dff4 Mon Sep 17 00:00:00 2001 From: Anna Thomas Date: Thu, 2 Nov 2017 18:24:04 +0000 Subject: Strip off invariant.start because memory locations arent invariant The original change was reverted in rL317217 because of the failure in the RS4GC testcase. I couldn't reproduce the failure on my local machine (a MacBook) but could reproduce it on a Linux box. The failure was around removing the uses of invariant.start. The fix here is to just RAUW undef (which was the first implementation in D39388). This is perfectly valid IR as discussed in the review. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317225 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 42 +++++++++++++++---- .../drop-invalid-metadata.ll | 48 ++++++++++++++++++++++ 2 files changed, 81 insertions(+), 9 deletions(-) diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 1ca77cfec32..44acfc88579 100644 --- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -125,10 +125,10 @@ struct RewriteStatepointsForGC : public ModulePass { Changed |= runOnFunction(F); if (Changed) { - // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn + // stripNonValidData asserts that shouldRewriteStatepointsIn // returns true for at least one function in the module. Since at least // one function changed, we know that the precondition is satisfied. - stripNonValidAttributesAndMetadata(M); + stripNonValidData(M); } return Changed; @@ -146,15 +146,17 @@ struct RewriteStatepointsForGC : public ModulePass { /// metadata implying dereferenceability that are no longer valid/correct after /// RewriteStatepointsForGC has run. This is because semantically, after /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire - /// heap. stripNonValidAttributesAndMetadata (conservatively) restores + /// heap. stripNonValidData (conservatively) restores /// correctness by erasing all attributes in the module that externally imply /// dereferenceability. Similar reasoning also applies to the noalias /// attributes and metadata. gc.statepoint can touch the entire heap including /// noalias objects. + /// Apart from attributes and metadata, we also remove instructions that imply + /// constant physical memory: llvm.invariant.start.
+ void stripNonValidData(Module &M); - // Helpers for stripNonValidAttributesAndMetadata - void stripNonValidAttributesAndMetadataFromBody(Function &F); + // Helpers for stripNonValidData + void stripNonValidDataFromBody(Function &F); void stripNonValidAttributesFromPrototype(Function &F); // Certain metadata on instructions are invalid after running RS4GC. @@ -2385,14 +2387,30 @@ void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC); } -void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) { +void RewriteStatepointsForGC::stripNonValidDataFromBody(Function &F) { if (F.empty()) return; LLVMContext &Ctx = F.getContext(); MDBuilder Builder(Ctx); + // Set of invariant.start instructions that we need to remove. + // Use this to avoid invalidating the instruction iterator. + SmallVector<IntrinsicInst *, 12> InvariantStartInstructions; + for (Instruction &I : instructions(F)) { + // invariant.start on memory location implies that the referenced memory + // location is constant and unchanging. This is no longer true after + // RewriteStatepointsForGC runs because there can be calls to gc.statepoint + // which frees the entire heap and the presence of invariant.start allows + // the optimizer to sink the load of a memory location past a statepoint, + // which is incorrect. + if (auto *II = dyn_cast<IntrinsicInst>(&I)) + if (II->getIntrinsicID() == Intrinsic::invariant_start) { + InvariantStartInstructions.push_back(II); + continue; + } + if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) { assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!"); bool IsImmutableTBAA = @@ -2422,6 +2440,12 @@ void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Functio RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex); } } + + // Delete the invariant.start instructions and RAUW undef. + for (auto *II : InvariantStartInstructions) { + II->replaceAllUsesWith(UndefValue::get(II->getType())); + II->eraseFromParent(); + } } /// Returns true if this function should be rewritten by this pass. The main @@ -2438,7 +2462,7 @@ static bool shouldRewriteStatepointsIn(Function &F) { return false; } -void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) { +void RewriteStatepointsForGC::stripNonValidData(Module &M) { #ifndef NDEBUG assert(llvm::any_of(M, shouldRewriteStatepointsIn) && "precondition!"); #endif @@ -2447,7 +2471,7 @@ void RewriteStatepointsForGC::stripNonValidData(Module &M) { stripNonValidAttributesFromPrototype(F); for (Function &F : M) - stripNonValidAttributesAndMetadataFromBody(F); + stripNonValidDataFromBody(F); } bool RewriteStatepointsForGC::runOnFunction(Function &F) { diff --git a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll index 105afa9def5..ebc15865a67 100644 --- a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll +++ b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll @@ -75,6 +75,54 @@ define void @test_dereferenceable(i32 addrspace(1)* addrspace(1)* %p, i32 %x, i3 ret void } +; invariant.start allows us to sink the load past the baz statepoint call into taken block, which is +; incorrect. remove the invariant.start and RAUW undef.
+define void @test_inv_start(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" { +; CHECK-LABEL: test_inv_start +; CHECK-NOT: invariant.start +; CHECK: gc.statepoint + %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p + %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1) + %v2 = load i32, i32 addrspace(1)* %v1 + call void @baz(i32 %x) + br i1 %cond, label %taken, label %untaken + +taken: + store i32 %v2, i32 addrspace(1)* %q, align 16 + call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1) + ret void + +; CHECK-LABEL: untaken: +; CHECK: gc.statepoint +untaken: + %foo = call i32 @escaping.invariant.start({}* %invst) + call void @dummy(i32 %foo) + ret void +} + +; invariant.start is removed and the uses are undef'ed. +define void @test_inv_start2(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" { +; CHECK-LABEL: test_inv_start2 +; CHECK-NOT: invariant.start +; CHECK: gc.statepoint + %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p + %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1) + %v2 = load i32, i32 addrspace(1)* %v1 + call void @baz(i32 %x) + br i1 %cond, label %taken, label %untaken + +taken: + store i32 %v2, i32 addrspace(1)* %q, align 16 + call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1) + ret void + +untaken: + ret void +} +declare {}* @llvm.invariant.start.p1i32(i64, i32 addrspace(1)* nocapture) nounwind readonly +declare void @llvm.invariant.end.p1i32({}*, i64, i32 addrspace(1)* nocapture) nounwind +declare i32 @escaping.invariant.start({}*) nounwind +declare void @dummy(i32) declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64, i32, void (i32)*, i32, i32, ...) ; Function Attrs: nounwind readonly -- cgit v1.2.3 From 2f759d471a7e20388901944f6b64d6c74c8a00ae Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 2 Nov 2017 18:44:54 +0000 Subject: [dsymutil][doc] Improve wording in manpage and rename file. 
- Improve wording - Rename llvm-dsymutil to dsymutil - Name -arch=<arch> argument Differential revision: https://reviews.llvm.org/D39561 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317226 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/CMakeLists.txt | 2 +- docs/CommandGuide/dsymutil.rst | 89 ++++++++++++++++++++++++++++++++++++ docs/CommandGuide/index.rst | 2 +- docs/CommandGuide/llvm-dsymutil.rst | 86 ---------------------------------- docs/CommandGuide/llvm-dwarfdump.rst | 2 +- test/tools/dsymutil/cmdline.test | 2 +- tools/dsymutil/dsymutil.cpp | 4 +- 7 files changed, 95 insertions(+), 92 deletions(-) create mode 100644 docs/CommandGuide/dsymutil.rst delete mode 100644 docs/CommandGuide/llvm-dsymutil.rst diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index 6e430459e5d..0f2681e0cd8 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -113,7 +113,7 @@ if (LLVM_ENABLE_SPHINX) if (${SPHINX_OUTPUT_MAN}) add_sphinx_target(man llvm) add_sphinx_target(man llvm-dwarfdump) - add_sphinx_target(man llvm-dsymutil) + add_sphinx_target(man dsymutil) endif() endif() diff --git a/docs/CommandGuide/dsymutil.rst b/docs/CommandGuide/dsymutil.rst new file mode 100644 index 00000000000..3cbbcb07894 --- /dev/null +++ b/docs/CommandGuide/dsymutil.rst @@ -0,0 +1,89 @@ +dsymutil - manipulate archived DWARF debug symbol files +======================================================= + +SYNOPSIS +-------- + +| :program:`dsymutil` [*options*] *executable* + +DESCRIPTION +----------- + +:program:`dsymutil` links the DWARF debug information found in the object files +for an executable *executable* by using debug symbols information contained in +its symbol table. By default, the linked debug information is placed in a +``.dSYM`` bundle with the same name as the executable. + +OPTIONS +------- +.. option:: -arch=<arch> + + Link DWARF debug information only for specified CPU architecture types. + Architectures may be specified by name. When using this option, an error will + be returned if any architectures cannot be properly linked. This option can + be specified multiple times, once for each desired architecture. All CPU + architectures will be linked by default and any architectures that can't be + properly linked will cause :program:`dsymutil` to return an error. + +.. option:: -dump-debug-map + + Dump the executable debug-map (the list of the object files containing the + debug information) in YAML format and exit. No DWARF link will take place. + +.. option:: -f, -flat + + Produce a flat dSYM file. A ``.dwarf`` extension will be appended to the + executable name unless the output file is specified using the -o option. + +.. option:: -no-odr + + Do not use ODR (One Definition Rule) for uniquing C++ types. + +.. option:: -no-output + + Do the link in memory, but do not emit the result file. + +.. option:: -no-swiftmodule-timestamp + + Don't check the timestamp for swiftmodule files. + +.. option:: -j <n>, -num-threads=<n> + + Specifies the maximum number (``n``) of simultaneous threads to use when + linking multiple architectures. + +.. option:: -o <path> + + Specifies an alternate ``path`` to place the dSYM bundle. The default dSYM + bundle path is created by appending ``.dSYM`` to the executable name. + +.. option:: -oso-prepend-path=<path> + + Specifies a ``path`` to prepend to all debug symbol object file paths. + +.. option:: -s, -symtab + + Dumps the symbol table found in executable or object file(s) and exits. + +.. option:: -v, -verbose + + Display verbose information when linking. + +..
option:: --version + + Display the version of the tool. + +.. option:: -y + + Treat *executable* as a YAML debug-map rather than an executable. + +EXIT STATUS +----------- + +:program:`dsymutil` returns 0 if the DWARF debug information was linked +successfully. Otherwise, it returns 1. + +SEE ALSO +-------- + +:manpage:`llvm-dwarfdump(1)` diff --git a/docs/CommandGuide/index.rst b/docs/CommandGuide/index.rst index a706ba1d675..805df00c173 100644 --- a/docs/CommandGuide/index.rst +++ b/docs/CommandGuide/index.rst @@ -30,7 +30,7 @@ Basic Commands llvm-stress llvm-symbolizer llvm-dwarfdump - llvm-dsymutil + dsymutil Debugging Tools ~~~~~~~~~~~~~~~ diff --git a/docs/CommandGuide/llvm-dsymutil.rst b/docs/CommandGuide/llvm-dsymutil.rst deleted file mode 100644 index 19340e194b8..00000000000 --- a/docs/CommandGuide/llvm-dsymutil.rst +++ /dev/null @@ -1,86 +0,0 @@ -llvm-dsymutil - manipulate archived DWARF debug symbol files -============================================================ - -SYNOPSIS --------- - -:program:`llvm-dsymutil` [*options*] [*filename*] - -DESCRIPTION ------------ - -:program:`llvm-dsymutil` links the DWARF debug information found in the object -files for the executable input file by using debug symbols information -contained in its symbol table. - -OPTIONS -------- -.. option:: -arch= - - Link DWARF debug information only for specified CPU architecture - types. This option can be specified multiple times, once for each - desired architecture. All cpu architectures will be linked by - default. - -.. option:: -dump-debug-map - - Parse and dump the debug map to standard output. Not DWARF link - will take place. - -.. option:: -f, -flat - - Produce a flat dSYM file (not a bundle). - -.. option:: -no-odr - - Do not use ODR (One Definition Rule) for type uniquing. - -.. option:: -no-output - - Do the link in memory, but do not emit the result file. - -.. option:: -no-swiftmodule-timestamp - - Don't check timestamp for swiftmodule files. - -.. option:: -j , -num-threads= - - Specifies the maximum number (n) of simultaneous threads to use - when linking multiple architectures. - -.. option:: -o= - - Specify the output file. default: .dwarf - -.. option:: -oso-prepend-path= - - Specify a directory to prepend to the paths of object files. - -.. option:: -s, -symtab - - Dumps the symbol table found in executable or object file(s) and - exits. - -.. option:: -v, -verbose - - Verbosity level - -.. option:: --version - - Display the version of the tool. - -.. option:: -y - - Treat the input file is a YAML debug map rather than a binary. - - -EXIT STATUS ------------ - -:program:`llvm-dsymutil` returns 0 if the DWARF debug information was linked -successfully. Otherwise, it returns 1. - -SEE ALSO --------- - -:manpage:`llvm-dwarfdump(1)` diff --git a/docs/CommandGuide/llvm-dwarfdump.rst b/docs/CommandGuide/llvm-dwarfdump.rst index 4e7791573e6..a3b62664cbe 100644 --- a/docs/CommandGuide/llvm-dwarfdump.rst +++ b/docs/CommandGuide/llvm-dwarfdump.rst @@ -139,4 +139,4 @@ successfully. Otherwise, it returns 1. SEE ALSO -------- -:manpage:`llvm-dsymutil(1)` +:manpage:`dsymutil(1)` diff --git a/test/tools/dsymutil/cmdline.test b/test/tools/dsymutil/cmdline.test index dea28cf3d90..f66858e9ae5 100644 --- a/test/tools/dsymutil/cmdline.test +++ b/test/tools/dsymutil/cmdline.test @@ -3,7 +3,7 @@ HELP: OVERVIEW: manipulate archived DWARF debug symbol files. 
HELP: USAGE: llvm-dsymutil{{[^ ]*}} [options] HELP-NOT: -reverse-iterate HELP: Specific Options: -HELP: -arch= +HELP: -arch= HELP: -dump-debug-map HELP: -flat HELP: -no-odr diff --git a/tools/dsymutil/dsymutil.cpp b/tools/dsymutil/dsymutil.cpp index b6d6c909abc..769668c8a9f 100644 --- a/tools/dsymutil/dsymutil.cpp +++ b/tools/dsymutil/dsymutil.cpp @@ -93,8 +93,8 @@ static list<std::string> ArchFlags( "arch", desc("Link DWARF debug information only for specified CPU architecture\n" "types. This option can be specified multiple times, once for each\n" - "desired architecture. All cpu architectures will be linked by\n" - "default."), + "desired architecture. All CPU architectures will be linked by\n" + "default."), value_desc("arch"), ZeroOrMore, cat(DsymCategory)); static opt -- cgit v1.2.3 From 161385fddde9e408c5ec777889c24c435e811332 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 2 Nov 2017 19:13:32 +0000 Subject: [X86] Change getHostCPUName fallback code to not select 'x86-64' for unknown CPUs in family 6 that have 64-bit support but not any newer SSE features. Use 'core2' instead We know that's the earliest CPU with 64-bit support. x86-64 has taken on a role of representing a more modern 64-bit CPU so we probably shouldn't be using that when we can't identify things. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317229 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Support/Host.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index d8fb3e1dc1d..7fbe9ad6a5c 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -794,8 +794,13 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, break; } if (Features2 & (1 << (FEATURE_EM64T - 32))) { - *Type = INTEL_X86_64; - break; // x86-64 + *Type = INTEL_CORE2; // "core2" + *Subtype = INTEL_CORE2_65; + break; + } + if (Features & (1 << FEATURE_SSE3)) { + *Type = INTEL_CORE_DUO; + break; } if (Features & (1 << FEATURE_SSE2)) { *Type = INTEL_PENTIUM_M; -- cgit v1.2.3 From 2bbdf002305de458a85a418024608a57183284fe Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 2 Nov 2017 19:13:34 +0000 Subject: [X86] Simplify the pentium4 code in getHostCPUName to be based on feature flags. Don't use 'x86-64' ever. 'x86-64' has started to reflect a sort of generic tuning flag for more modern 64-bit CPUs. We probably shouldn't be using it as the name of an unidentifiable pentium4. So use nocona for all 64-bit pentium4s instead. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317230 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Support/Host.cpp | 40 ++++++----------------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 7fbe9ad6a5c..c167df5a444 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -351,7 +351,6 @@ enum ProcessorTypes { INTEL_PENTIUM_IV, INTEL_PENTIUM_M, INTEL_CORE_DUO, - INTEL_X86_64, INTEL_NOCONA, INTEL_PRESCOTT, AMD_i486, @@ -819,40 +818,15 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, } break; case 15: { - switch (Model) { - case 0: // Pentium 4 processor, Intel Xeon processor. All processors are - // model 00h and manufactured using the 0.18 micron process. - case 1: // Pentium 4 processor, Intel Xeon processor, Intel Xeon - // processor MP, and Intel Celeron processor. All processors are - // model 01h and manufactured using the 0.18 micron process.
- case 2: // Pentium 4 processor, Mobile Intel Pentium 4 processor - M, - // Intel Xeon processor, Intel Xeon processor MP, Intel Celeron - // processor, and Mobile Intel Celeron processor. All processors - // are model 02h and manufactured using the 0.13 micron process. - *Type = ((Features2 & (1 << (FEATURE_EM64T - 32))) ? INTEL_X86_64 - : INTEL_PENTIUM_IV); - break; - - case 3: // Pentium 4 processor, Intel Xeon processor, Intel Celeron D - // processor. All processors are model 03h and manufactured using - // the 90 nm process. - case 4: // Pentium 4 processor, Pentium 4 processor Extreme Edition, - // Pentium D processor, Intel Xeon processor, Intel Xeon - // processor MP, Intel Celeron D processor. All processors are - // model 04h and manufactured using the 90 nm process. - case 6: // Pentium 4 processor, Pentium D processor, Pentium processor - // Extreme Edition, Intel Xeon processor, Intel Xeon processor - // MP, Intel Celeron D processor. All processors are model 06h - // and manufactured using the 65 nm process. - *Type = ((Features2 & (1 << (FEATURE_EM64T - 32))) ? INTEL_NOCONA - : INTEL_PRESCOTT); + if (Features2 & (1 << (FEATURE_EM64T - 32))) { + *Type = INTEL_NOCONA; break; - - default: - *Type = ((Features2 & (1 << (FEATURE_EM64T - 32))) ? INTEL_X86_64 - : INTEL_PENTIUM_IV); + } + if (Features & (1 << FEATURE_SSE3)) { + *Type = INTEL_PRESCOTT; break; } + *Type = INTEL_PENTIUM_IV; break; } default: @@ -1150,8 +1124,6 @@ StringRef sys::getHostCPUName() { return "knl"; case INTEL_KNM: return "knm"; - case INTEL_X86_64: - return "x86-64"; case INTEL_NOCONA: return "nocona"; case INTEL_PRESCOTT: -- cgit v1.2.3 From ce68f2c6292da52ec79f7318092f4b3f8bc02dd7 Mon Sep 17 00:00:00 2001 From: Martin Storsjo Date: Thu, 2 Nov 2017 20:05:20 +0000 Subject: [test] Move llvm-lib tests into tools/llvm-lib. NFC. Similarly to SVN r317189 for llvm-dlltool, these are probably easier to find in a tools subdirectory with a name identical to the tool, than in a toplevel directory with a different name. This matches the move of LibDriver itself in SVN r302995. 
Differential Revision: https://reviews.llvm.org/D39531 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317262 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/LibDriver/Inputs/a.s | 2 -- test/LibDriver/Inputs/b.s | 2 -- test/LibDriver/Inputs/cl-gl.obj | Bin 3734 -> 0 bytes test/LibDriver/Inputs/resource.res | Bin 108 -> 0 bytes test/LibDriver/infer-output-path.test | 16 ---------------- test/LibDriver/invalid.test | 2 -- test/LibDriver/libpath.test | 15 --------------- test/LibDriver/lit.local.cfg | 3 --- test/LibDriver/no-inputs.test | 2 -- test/LibDriver/resource.test | 3 --- test/LibDriver/thin.test | 9 --------- test/LibDriver/use-paths.test | 24 ------------------------ test/tools/llvm-lib/Inputs/a.s | 2 ++ test/tools/llvm-lib/Inputs/b.s | 2 ++ test/tools/llvm-lib/Inputs/cl-gl.obj | Bin 0 -> 3734 bytes test/tools/llvm-lib/Inputs/resource.res | Bin 0 -> 108 bytes test/tools/llvm-lib/infer-output-path.test | 16 ++++++++++++++++ test/tools/llvm-lib/invalid.test | 2 ++ test/tools/llvm-lib/libpath.test | 15 +++++++++++++++ test/tools/llvm-lib/lit.local.cfg | 3 +++ test/tools/llvm-lib/no-inputs.test | 2 ++ test/tools/llvm-lib/resource.test | 3 +++ test/tools/llvm-lib/thin.test | 9 +++++++++ test/tools/llvm-lib/use-paths.test | 24 ++++++++++++++++++++++++ 24 files changed, 78 insertions(+), 78 deletions(-) delete mode 100644 test/LibDriver/Inputs/a.s delete mode 100644 test/LibDriver/Inputs/b.s delete mode 100755 test/LibDriver/Inputs/cl-gl.obj delete mode 100644 test/LibDriver/Inputs/resource.res delete mode 100644 test/LibDriver/infer-output-path.test delete mode 100644 test/LibDriver/invalid.test delete mode 100644 test/LibDriver/libpath.test delete mode 100644 test/LibDriver/lit.local.cfg delete mode 100644 test/LibDriver/no-inputs.test delete mode 100644 test/LibDriver/resource.test delete mode 100644 test/LibDriver/thin.test delete mode 100644 test/LibDriver/use-paths.test create mode 100644 test/tools/llvm-lib/Inputs/a.s create mode 100644 test/tools/llvm-lib/Inputs/b.s create mode 100755 test/tools/llvm-lib/Inputs/cl-gl.obj create mode 100644 test/tools/llvm-lib/Inputs/resource.res create mode 100644 test/tools/llvm-lib/infer-output-path.test create mode 100644 test/tools/llvm-lib/invalid.test create mode 100644 test/tools/llvm-lib/libpath.test create mode 100644 test/tools/llvm-lib/lit.local.cfg create mode 100644 test/tools/llvm-lib/no-inputs.test create mode 100644 test/tools/llvm-lib/resource.test create mode 100644 test/tools/llvm-lib/thin.test create mode 100644 test/tools/llvm-lib/use-paths.test diff --git a/test/LibDriver/Inputs/a.s b/test/LibDriver/Inputs/a.s deleted file mode 100644 index 88258e2797f..00000000000 --- a/test/LibDriver/Inputs/a.s +++ /dev/null @@ -1,2 +0,0 @@ -.globl a -a: diff --git a/test/LibDriver/Inputs/b.s b/test/LibDriver/Inputs/b.s deleted file mode 100644 index 4890c9247c7..00000000000 --- a/test/LibDriver/Inputs/b.s +++ /dev/null @@ -1,2 +0,0 @@ -.globl b -b: diff --git a/test/LibDriver/Inputs/cl-gl.obj b/test/LibDriver/Inputs/cl-gl.obj deleted file mode 100755 index ff746557d41..00000000000 Binary files a/test/LibDriver/Inputs/cl-gl.obj and /dev/null differ diff --git a/test/LibDriver/Inputs/resource.res b/test/LibDriver/Inputs/resource.res deleted file mode 100644 index f1c799fbbb0..00000000000 Binary files a/test/LibDriver/Inputs/resource.res and /dev/null differ diff --git a/test/LibDriver/infer-output-path.test b/test/LibDriver/infer-output-path.test deleted file mode 100644 index c63b0abdf6e..00000000000 --- 
a/test/LibDriver/infer-output-path.test +++ /dev/null @@ -1,16 +0,0 @@ -RUN: rm -rf %t && mkdir -p %t -RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %t/a.obj %S/Inputs/a.s -RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %t/b.o %S/Inputs/b.s -RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %t/c %S/Inputs/b.s - -RUN: rm -f %t/a.lib -RUN: llvm-lib %t/a.obj -RUN: test -e %t/a.lib - -RUN: rm -f %t/b.lib -RUN: llvm-lib /libpath:%t b.o -RUN: test -e %t/b.lib - -RUN: rm -f %t/c.lib -RUN: llvm-lib /libpath:%t c -RUN: test -e %t/c.lib diff --git a/test/LibDriver/invalid.test b/test/LibDriver/invalid.test deleted file mode 100644 index 2978177a431..00000000000 --- a/test/LibDriver/invalid.test +++ /dev/null @@ -1,2 +0,0 @@ -RUN: not llvm-lib %S/Inputs/cl-gl.obj 2>&1 | FileCheck %s -CHECK: not a COFF object, bitcode or resource file diff --git a/test/LibDriver/libpath.test b/test/LibDriver/libpath.test deleted file mode 100644 index 26a1e8dc8b6..00000000000 --- a/test/LibDriver/libpath.test +++ /dev/null @@ -1,15 +0,0 @@ -RUN: mkdir -p %t/a %t/b -RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %t/a/foo.obj %S/Inputs/a.s -RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %t/b/foo.obj %S/Inputs/b.s - -RUN: env "LIB=%t/a;%t/b" llvm-lib /out:%t1.lib foo.obj -RUN: llvm-nm %t1.lib | FileCheck --check-prefix=A %s - -RUN: llvm-lib /out:%t2.lib /libpath:%t/a /libpath:%t/b foo.obj -RUN: llvm-nm %t2.lib | FileCheck --check-prefix=A %s - -RUN: env LIB=%t/a llvm-lib /libpath:%t/b /out:%t3.lib foo.obj -RUN: llvm-nm %t3.lib | FileCheck --check-prefix=B %s - -A: T a -B: T b diff --git a/test/LibDriver/lit.local.cfg b/test/LibDriver/lit.local.cfg deleted file mode 100644 index e71f3cc4c41..00000000000 --- a/test/LibDriver/lit.local.cfg +++ /dev/null @@ -1,3 +0,0 @@ -if not 'X86' in config.root.targets: - config.unsupported = True - diff --git a/test/LibDriver/no-inputs.test b/test/LibDriver/no-inputs.test deleted file mode 100644 index 95d6555d58c..00000000000 --- a/test/LibDriver/no-inputs.test +++ /dev/null @@ -1,2 +0,0 @@ -RUN: llvm-lib -out:%t.a -RUN: test ! -e %t.a diff --git a/test/LibDriver/resource.test b/test/LibDriver/resource.test deleted file mode 100644 index 6c3dad50b45..00000000000 --- a/test/LibDriver/resource.test +++ /dev/null @@ -1,3 +0,0 @@ -RUN: llvm-lib /out:%t %S/Inputs/resource.res -RUN: llvm-ar t %t | FileCheck %s -CHECK: resource.res diff --git a/test/LibDriver/thin.test b/test/LibDriver/thin.test deleted file mode 100644 index c401de41a80..00000000000 --- a/test/LibDriver/thin.test +++ /dev/null @@ -1,9 +0,0 @@ -RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %t %S/Inputs/a.s - -RUN: llvm-lib -out:%t.a %t -RUN: FileCheck --check-prefix=FAT %s < %t.a -FAT: ! - -RUN: llvm-lib -out:%t.thin.a -llvmlibthin %t -RUN: FileCheck --check-prefix=THIN %s < %t.thin.a -THIN: ! diff --git a/test/LibDriver/use-paths.test b/test/LibDriver/use-paths.test deleted file mode 100644 index 971c216127e..00000000000 --- a/test/LibDriver/use-paths.test +++ /dev/null @@ -1,24 +0,0 @@ -llvm-lib should behave like "link.exe /lib" and use relative paths to describe -archive members. - -First, get in a clean working directory. -RUN: rm -rf %t && mkdir -p %t && cd %t - -Make foo/a.obj and foo/b.obj. 
-RUN: mkdir foo -RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o foo/a.obj %S/Inputs/a.s -RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o foo/b.obj %S/Inputs/b.s - -RUN: llvm-lib -out:foo.lib foo/a.obj foo/b.obj -RUN: llvm-ar t foo.lib | FileCheck %s - -FIXME: We should probably use backslashes on Windows to better match MSVC tools. -CHECK: foo/a.obj -CHECK: foo/b.obj - -Do it again with absolute paths and see that we get something. -RUN: llvm-lib -out:foo.lib %t/foo/a.obj %t/foo/b.obj -RUN: llvm-ar t foo.lib | FileCheck %s --check-prefix=ABS - -ABS: {{.*}}/foo/a.obj -ABS: {{.*}}/foo/b.obj diff --git a/test/tools/llvm-lib/Inputs/a.s b/test/tools/llvm-lib/Inputs/a.s new file mode 100644 index 00000000000..88258e2797f --- /dev/null +++ b/test/tools/llvm-lib/Inputs/a.s @@ -0,0 +1,2 @@ +.globl a +a: diff --git a/test/tools/llvm-lib/Inputs/b.s b/test/tools/llvm-lib/Inputs/b.s new file mode 100644 index 00000000000..4890c9247c7 --- /dev/null +++ b/test/tools/llvm-lib/Inputs/b.s @@ -0,0 +1,2 @@ +.globl b +b: diff --git a/test/tools/llvm-lib/Inputs/cl-gl.obj b/test/tools/llvm-lib/Inputs/cl-gl.obj new file mode 100755 index 00000000000..ff746557d41 Binary files /dev/null and b/test/tools/llvm-lib/Inputs/cl-gl.obj differ diff --git a/test/tools/llvm-lib/Inputs/resource.res b/test/tools/llvm-lib/Inputs/resource.res new file mode 100644 index 00000000000..f1c799fbbb0 Binary files /dev/null and b/test/tools/llvm-lib/Inputs/resource.res differ diff --git a/test/tools/llvm-lib/infer-output-path.test b/test/tools/llvm-lib/infer-output-path.test new file mode 100644 index 00000000000..c63b0abdf6e --- /dev/null +++ b/test/tools/llvm-lib/infer-output-path.test @@ -0,0 +1,16 @@ +RUN: rm -rf %t && mkdir -p %t +RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %t/a.obj %S/Inputs/a.s +RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %t/b.o %S/Inputs/b.s +RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %t/c %S/Inputs/b.s + +RUN: rm -f %t/a.lib +RUN: llvm-lib %t/a.obj +RUN: test -e %t/a.lib + +RUN: rm -f %t/b.lib +RUN: llvm-lib /libpath:%t b.o +RUN: test -e %t/b.lib + +RUN: rm -f %t/c.lib +RUN: llvm-lib /libpath:%t c +RUN: test -e %t/c.lib diff --git a/test/tools/llvm-lib/invalid.test b/test/tools/llvm-lib/invalid.test new file mode 100644 index 00000000000..2978177a431 --- /dev/null +++ b/test/tools/llvm-lib/invalid.test @@ -0,0 +1,2 @@ +RUN: not llvm-lib %S/Inputs/cl-gl.obj 2>&1 | FileCheck %s +CHECK: not a COFF object, bitcode or resource file diff --git a/test/tools/llvm-lib/libpath.test b/test/tools/llvm-lib/libpath.test new file mode 100644 index 00000000000..26a1e8dc8b6 --- /dev/null +++ b/test/tools/llvm-lib/libpath.test @@ -0,0 +1,15 @@ +RUN: mkdir -p %t/a %t/b +RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %t/a/foo.obj %S/Inputs/a.s +RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %t/b/foo.obj %S/Inputs/b.s + +RUN: env "LIB=%t/a;%t/b" llvm-lib /out:%t1.lib foo.obj +RUN: llvm-nm %t1.lib | FileCheck --check-prefix=A %s + +RUN: llvm-lib /out:%t2.lib /libpath:%t/a /libpath:%t/b foo.obj +RUN: llvm-nm %t2.lib | FileCheck --check-prefix=A %s + +RUN: env LIB=%t/a llvm-lib /libpath:%t/b /out:%t3.lib foo.obj +RUN: llvm-nm %t3.lib | FileCheck --check-prefix=B %s + +A: T a +B: T b diff --git a/test/tools/llvm-lib/lit.local.cfg b/test/tools/llvm-lib/lit.local.cfg new file mode 100644 index 00000000000..e71f3cc4c41 --- /dev/null +++ b/test/tools/llvm-lib/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'X86' in config.root.targets: + 
config.unsupported = True + diff --git a/test/tools/llvm-lib/no-inputs.test b/test/tools/llvm-lib/no-inputs.test new file mode 100644 index 00000000000..95d6555d58c --- /dev/null +++ b/test/tools/llvm-lib/no-inputs.test @@ -0,0 +1,2 @@ +RUN: llvm-lib -out:%t.a +RUN: test ! -e %t.a diff --git a/test/tools/llvm-lib/resource.test b/test/tools/llvm-lib/resource.test new file mode 100644 index 00000000000..6c3dad50b45 --- /dev/null +++ b/test/tools/llvm-lib/resource.test @@ -0,0 +1,3 @@ +RUN: llvm-lib /out:%t %S/Inputs/resource.res +RUN: llvm-ar t %t | FileCheck %s +CHECK: resource.res diff --git a/test/tools/llvm-lib/thin.test b/test/tools/llvm-lib/thin.test new file mode 100644 index 00000000000..c401de41a80 --- /dev/null +++ b/test/tools/llvm-lib/thin.test @@ -0,0 +1,9 @@ +RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %t %S/Inputs/a.s + +RUN: llvm-lib -out:%t.a %t +RUN: FileCheck --check-prefix=FAT %s < %t.a +FAT: ! + +RUN: llvm-lib -out:%t.thin.a -llvmlibthin %t +RUN: FileCheck --check-prefix=THIN %s < %t.thin.a +THIN: ! diff --git a/test/tools/llvm-lib/use-paths.test b/test/tools/llvm-lib/use-paths.test new file mode 100644 index 00000000000..971c216127e --- /dev/null +++ b/test/tools/llvm-lib/use-paths.test @@ -0,0 +1,24 @@ +llvm-lib should behave like "link.exe /lib" and use relative paths to describe +archive members. + +First, get in a clean working directory. +RUN: rm -rf %t && mkdir -p %t && cd %t + +Make foo/a.obj and foo/b.obj. +RUN: mkdir foo +RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o foo/a.obj %S/Inputs/a.s +RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o foo/b.obj %S/Inputs/b.s + +RUN: llvm-lib -out:foo.lib foo/a.obj foo/b.obj +RUN: llvm-ar t foo.lib | FileCheck %s + +FIXME: We should probably use backslashes on Windows to better match MSVC tools. +CHECK: foo/a.obj +CHECK: foo/b.obj + +Do it again with absolute paths and see that we get something. +RUN: llvm-lib -out:foo.lib %t/foo/a.obj %t/foo/b.obj +RUN: llvm-ar t foo.lib | FileCheck %s --check-prefix=ABS + +ABS: {{.*}}/foo/a.obj +ABS: {{.*}}/foo/b.obj -- cgit v1.2.3 From 15f5deb8cb6fb5c575a3c7cda87a5a723b5ada2b Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Thu, 2 Nov 2017 20:22:03 +0000 Subject: Fix llvm-dsymutil test in -DLLVM_ENABLE_THREADS=OFF mode After r316999, tools/dsymutil/X86/alias.test started failing in builds that have threading disabled. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317263 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/dsymutil/dsymutil.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/dsymutil/dsymutil.cpp b/tools/dsymutil/dsymutil.cpp index 769668c8a9f..9d9a2418379 100644 --- a/tools/dsymutil/dsymutil.cpp +++ b/tools/dsymutil/dsymutil.cpp @@ -338,7 +338,6 @@ int main(int argc, char **argv) { NumThreads = 1; NumThreads = std::min(NumThreads, DebugMapPtrsOrErr->size()); - llvm::ThreadPool Threads(NumThreads); // If there is more than one link to execute, we need to generate // temporary files. @@ -366,17 +365,19 @@ int main(int argc, char **argv) { // FIXME: The DwarfLinker can have some very deep recursion that can max // out the (significantly smaller) stack when using threads. We don't // want this limitation when we only have a single thread. 
- if (NumThreads == 1) + if (NumThreads == 1) { LinkLambda(); - else + } else { + llvm::ThreadPool Threads(NumThreads); Threads.async(LinkLambda); + Threads.wait(); + } if (NeedsTempFiles) TempFiles.emplace_back(Map->getTriple().getArchName().str(), OutputFile); } - Threads.wait(); if (NeedsTempFiles && !MachOUtils::generateUniversalBinary( -- cgit v1.2.3 From c626458f76209b25c24dcbeb4545534f8bc120ba Mon Sep 17 00:00:00 2001 From: Shoaib Meenai Date: Thu, 2 Nov 2017 20:33:36 +0000 Subject: [cmake] Remove policy conditionals LLVM now requires a minimum of cmake 3.4.3, and all the policies currently being set are present in that cmake version, so the conditionals will always be true and are therefore unnecessary. The motivation is that the conditionals can give the false impression that the policy settings are optional, whereas for example it's necessary to set CMP0056 in order for `check_linker_flags` to operate correctly after r316972. Inline the project version and language setting in the process. Differential Revision: https://reviews.llvm.org/D39442 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317264 91177308-0d34-0410-b5e6-96231b3b80d8 --- CMakeLists.txt | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 04565038311..e27562dc8b5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,26 +2,20 @@ cmake_minimum_required(VERSION 3.4.3) -if(POLICY CMP0022) - cmake_policy(SET CMP0022 NEW) # automatic when 2.8.12 is required -endif() +cmake_policy(SET CMP0022 NEW) -if (POLICY CMP0051) - # CMake 3.1 and higher include generator expressions of the form - # $<TARGET_OBJECTS:objlib> in the SOURCES property. These need to be - # stripped everywhere that access the SOURCES property, so we just - # defer to the OLD behavior of not including generator expressions - # in the output for now. - cmake_policy(SET CMP0051 OLD) -endif() +cmake_policy(SET CMP0048 NEW) -if(POLICY CMP0056) - cmake_policy(SET CMP0056 NEW) -endif() +# CMake 3.1 and higher include generator expressions of the form +# $<TARGET_OBJECTS:objlib> in the SOURCES property. These need to be +# stripped everywhere that access the SOURCES property, so we just +# defer to the OLD behavior of not including generator expressions +# in the output for now.
+cmake_policy(SET CMP0051 OLD) -if(POLICY CMP0057) - cmake_policy(SET CMP0057 NEW) -endif() +cmake_policy(SET CMP0056 NEW) + +cmake_policy(SET CMP0057 NEW) if(NOT DEFINED LLVM_VERSION_MAJOR) set(LLVM_VERSION_MAJOR 6) @@ -36,13 +30,6 @@ if(NOT DEFINED LLVM_VERSION_SUFFIX) set(LLVM_VERSION_SUFFIX svn) endif() -if (POLICY CMP0048) - cmake_policy(SET CMP0048 NEW) - set(cmake_3_0_PROJ_VERSION - VERSION ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}) - set(cmake_3_0_LANGUAGES LANGUAGES) -endif() - if (NOT PACKAGE_VERSION) set(PACKAGE_VERSION "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}${LLVM_VERSION_SUFFIX}") @@ -56,9 +43,8 @@ if ((CMAKE_GENERATOR MATCHES "Visual Studio") AND (CMAKE_GENERATOR_TOOLSET STREQ endif() project(LLVM - ${cmake_3_0_PROJ_VERSION} - ${cmake_3_0_LANGUAGES} - C CXX ASM) + VERSION ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH} + LANGUAGES C CXX ASM) if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) message(STATUS "No build type selected, default to Debug") -- cgit v1.2.3 From 37bbee84d83c14043e07ea9d76bb7789c697eb6d Mon Sep 17 00:00:00 2001 From: Konstantin Zhuravlyov Date: Thu, 2 Nov 2017 20:48:06 +0000 Subject: AMDGPU: Remove outdated fixme (it was already fixed) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317266 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIMachineFunctionInfo.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index ade909cc84e..5f5636e119a 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -87,9 +87,6 @@ public: /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo final : public AMDGPUMachineFunction { - // FIXME: This should be removed and getPreloadedValue moved here. - friend class SIRegisterInfo; - unsigned TIDReg = AMDGPU::NoRegister; // Registers that may be reserved for spilling purposes. These may be the same -- cgit v1.2.3 From 2e63034efd79807891a4d201daeb434c2d26c609 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 2 Nov 2017 20:58:58 +0000 Subject: Add missing header guards. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317267 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm-c/DebugInfo.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/llvm-c/DebugInfo.h b/include/llvm-c/DebugInfo.h index 15f6b57d883..2c2fdbdf173 100644 --- a/include/llvm-c/DebugInfo.h +++ b/include/llvm-c/DebugInfo.h @@ -14,6 +14,9 @@ /// //===----------------------------------------------------------------------===// +#ifndef LLVM_C_DEBUGINFO_H +#define LLVM_C_DEBUGINFO_H + #include "llvm-c/Core.h" #ifdef __cplusplus @@ -200,3 +203,5 @@ LLVMDIBuilderCreateDebugLocation(LLVMContextRef Ctx, unsigned Line, #ifdef __cplusplus } // end extern "C" #endif + +#endif -- cgit v1.2.3 From b69a2a9ae35ca7d19399c8d23287f10a8bdf0f45 Mon Sep 17 00:00:00 2001 From: Anna Thomas Date: Thu, 2 Nov 2017 21:21:02 +0000 Subject: [LoopPredication] Enable predication when latchCheckIV is wider than rangeCheck Summary: This patch allows us to predicate range checks that have a type narrower than the latch check type. We leverage SCEV analysis to identify a truncate for the latchLimit and latchStart. There are also safety checks in place which require the start and limit to be known at compile time.
We require this to make sure that the SCEV truncate expr for the IV corresponding to the latch does not cause us to lose information about the IV range. Added tests show the loop predication over range checks that are of various types and are narrower than the latch type. This enhancement has been in our downstream tree for a while. Reviewers: apilipenko, sanjoy, mkazantsev Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D39500 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317269 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/LoopPredication.cpp | 106 +++++++++++++++++++--- test/Transforms/LoopPredication/widened.ll | 138 +++++++++++++++++++++++++++++ 2 files changed, 234 insertions(+), 10 deletions(-) create mode 100644 test/Transforms/LoopPredication/widened.ll diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp index 9a623be234f..e680fbed113 100644 --- a/lib/Transforms/Scalar/LoopPredication.cpp +++ b/lib/Transforms/Scalar/LoopPredication.cpp @@ -174,6 +174,9 @@ using namespace llvm; +static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation", + cl::Hidden, cl::init(true)); + namespace { class LoopPredication { /// Represents an induction variable check: @@ -212,6 +215,22 @@ class LoopPredication { IRBuilder<> &Builder); bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander); + // When the IV type is wider than the range operand type, we can still do loop + // predication, by generating SCEVs for the range and latch that are of the + // same type. We achieve this by generating a SCEV truncate expression for the + // latch IV. This is done iff truncation of the IV is a safe operation, + // without loss of information. + // Another way to achieve this is by generating a wider type SCEV for the + // range check operand, however, this needs a more involved check that + // operands do not overflow. This can lead to loss of information when the + // range operand is of the form: add i32 %offset, %iv. We need to prove that + // sext(x + y) is same as sext(x) + sext(y). + // This function returns true if we can safely represent the IV type in + // the RangeCheckType without loss of information. + bool isSafeToTruncateWideIVType(Type *RangeCheckType); + // Return the loopLatchCheck corresponding to the RangeCheckType if safe to do + // so. + Optional<LoopICmp> generateLoopLatchCheck(Type *RangeCheckType); public: LoopPredication(ScalarEvolution *SE) : SE(SE){}; bool runOnLoop(Loop *L); @@ -301,6 +320,34 @@ Value *LoopPredication::expandCheck(SCEVExpander &Expander, return Builder.CreateICmp(Pred, LHSV, RHSV); } +Optional<LoopPredication::LoopICmp> +LoopPredication::generateLoopLatchCheck(Type *RangeCheckType) { + + auto *LatchType = LatchCheck.IV->getType(); + if (RangeCheckType == LatchType) + return LatchCheck; + // For now, bail out if latch type is narrower than range type. + if (DL->getTypeSizeInBits(LatchType) < DL->getTypeSizeInBits(RangeCheckType)) + return None; + if (!isSafeToTruncateWideIVType(RangeCheckType)) + return None; + // We can now safely identify the truncated version of the IV and limit for + // RangeCheckType.
+ LoopICmp NewLatchCheck; + NewLatchCheck.Pred = LatchCheck.Pred; + NewLatchCheck.IV = dyn_cast<SCEVAddRecExpr>( + SE->getTruncateExpr(LatchCheck.IV, RangeCheckType)); + if (!NewLatchCheck.IV) + return None; + NewLatchCheck.Limit = SE->getTruncateExpr(LatchCheck.Limit, RangeCheckType); + DEBUG(dbgs() << "IV of type: " << *LatchType + << "can be represented as range check type:" << *RangeCheckType + << "\n"); + DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n"); + DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n"); + return NewLatchCheck; +} + /// If ICI can be widened to a loop invariant condition emits the loop /// invariant condition in the loop preheader and return it, otherwise /// returns None. @@ -325,22 +372,31 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI, return None; } auto *RangeCheckIV = RangeCheck->IV; - auto *Ty = RangeCheckIV->getType(); - if (Ty != LatchCheck.IV->getType()) { - DEBUG(dbgs() << "Type mismatch between range check and latch IVs!\n"); - return None; - } if (!RangeCheckIV->isAffine()) { DEBUG(dbgs() << "Range check IV is not affine!\n"); return None; } auto *Step = RangeCheckIV->getStepRecurrence(*SE); - if (Step != LatchCheck.IV->getStepRecurrence(*SE)) { + // We cannot just compare with latch IV step because the latch and range IVs + // may have different types. + if (!Step->isOne()) { DEBUG(dbgs() << "Range check and latch have IVs different steps!\n"); return None; } - assert(Step->isOne() && "must be one"); + auto *Ty = RangeCheckIV->getType(); + auto CurrLatchCheckOpt = generateLoopLatchCheck(Ty); + if (!CurrLatchCheckOpt) { + DEBUG(dbgs() << "Failed to generate a loop latch check " + "corresponding to range type: " + << *Ty << "\n"); + return None; + } + LoopICmp CurrLatchCheck = *CurrLatchCheckOpt; + // At this point the range check step and latch step should have the same + // value and type. + assert(Step == CurrLatchCheck.IV->getStepRecurrence(*SE) && + "Range and latch should have same step recurrence!"); // Generate the widened condition: // guardStart u< guardLimit && // latchLimit <pred> guardLimit - 1 - guardStart + latchStart // where <pred> depends on the latch condition predicate. See the file // header comment for the reasoning. const SCEV *GuardStart = RangeCheckIV->getStart(); const SCEV *GuardLimit = RangeCheck->Limit; - const SCEV *LatchStart = LatchCheck.IV->getStart(); - const SCEV *LatchLimit = LatchCheck.Limit; + const SCEV *LatchStart = CurrLatchCheck.IV->getStart(); + const SCEV *LatchLimit = CurrLatchCheck.Limit; // guardLimit - guardStart + latchStart - 1 const SCEV *RHS = @@ -357,7 +413,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI, SE->getMinusSCEV(LatchStart, SE->getOne(Ty))); ICmpInst::Predicate LimitCheckPred; - switch (LatchCheck.Pred) { + switch (CurrLatchCheck.Pred) { case ICmpInst::ICMP_ULT: LimitCheckPred = ICmpInst::ICMP_ULE; break; @@ -510,6 +566,36 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() { return Result; } +// Returns true if it's safe to truncate the IV to RangeCheckType. +bool LoopPredication::isSafeToTruncateWideIVType(Type *RangeCheckType) { + if (!EnableIVTruncation) + return false; + assert(DL->getTypeSizeInBits(LatchCheck.IV->getType()) > + DL->getTypeSizeInBits(RangeCheckType) && + "Expected latch check IV type to be larger than range check operand " + "type!"); + // The start and end values of the IV should be known. This is to guarantee + // that truncating the wide type will not lose information.
+ auto *Limit = dyn_cast<SCEVConstant>(LatchCheck.Limit); + auto *Start = dyn_cast<SCEVConstant>(LatchCheck.IV->getStart()); + if (!Limit || !Start) + return false; + // This check makes sure that the IV does not change sign during loop + // iterations. Consider latchType = i64, LatchStart = 5, Pred = ICMP_SGE, + // LatchEnd = 2, rangeCheckType = i32. If it's not a monotonic predicate, the + // IV wraps around, and the truncation of the IV would lose the range of + // iterations between 2^32 and 2^64. + bool Increasing; + if (!SE->isMonotonicPredicate(LatchCheck.IV, LatchCheck.Pred, Increasing)) + return false; + // The active bits should be less than the bits in the RangeCheckType. This + // guarantees that truncating the latch check to RangeCheckType is a safe + // operation. + auto RangeCheckTypeBitSize = DL->getTypeSizeInBits(RangeCheckType); + return Start->getAPInt().getActiveBits() < RangeCheckTypeBitSize && + Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize; +} + bool LoopPredication::runOnLoop(Loop *Loop) { L = Loop; diff --git a/test/Transforms/LoopPredication/widened.ll b/test/Transforms/LoopPredication/widened.ll new file mode 100644 index 00000000000..33c4e270613 --- /dev/null +++ b/test/Transforms/LoopPredication/widened.ll @@ -0,0 +1,138 @@ +; RUN: opt -S -loop-predication -loop-predication-enable-iv-truncation=true < %s 2>&1 | FileCheck %s +declare void @llvm.experimental.guard(i1, ...) + +declare i32 @length(i8*) + +declare i16 @short_length(i8*) +; Consider range check of type i16 and i32, while IV is of type i64 +; We can loop predicate this because the IV range is within i16 and within i32. +define i64 @iv_wider_type_rc_two_narrow_types(i32 %offA, i16 %offB, i8* %arrA, i8* %arrB) { +; CHECK-LABEL: iv_wider_type_rc_two_narrow_types entry: +; CHECK-LABEL: entry: +; CHECK: [[idxB:[^ ]+]] = sub i16 %lengthB, %offB +; CHECK-NEXT: [[limit_checkB:[^ ]+]] = icmp ule i16 16, [[idxB]] +; CHECK-NEXT: [[first_iteration_checkB:[^ ]+]] = icmp ult i16 %offB, %lengthB +; CHECK-NEXT: [[WideChkB:[^ ]+]] = and i1 [[first_iteration_checkB]], [[limit_checkB]] +; CHECK-NEXT: [[idxA:[^ ]+]] = sub i32 %lengthA, %offA +; CHECK-NEXT: [[limit_checkA:[^ ]+]] = icmp ule i32 16, [[idxA]] +; CHECK-NEXT: [[first_iteration_checkA:[^ ]+]] = icmp ult i32 %offA, %lengthA +; CHECK-NEXT: [[WideChkA:[^ ]+]] = and i1 [[first_iteration_checkA]], [[limit_checkA]] + %lengthA = call i32 @length(i8* %arrA) + %lengthB = call i16 @short_length(i8* %arrB) + br label %loop + +loop: +; CHECK-LABEL: loop: +; CHECK: [[invariant_check:[^ ]+]] = and i1 [[WideChkB]], [[WideChkA]] +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[invariant_check]], i32 9) + %iv = phi i64 [0, %entry ], [ %iv.next, %loop ] + %iv.trunc.32 = trunc i64 %iv to i32 + %iv.trunc.16 = trunc i64 %iv to i16 + %indexA = add i32 %iv.trunc.32, %offA + %indexB = add i16 %iv.trunc.16, %offB + %rcA = icmp ult i32 %indexA, %lengthA + %rcB = icmp ult i16 %indexB, %lengthB + %wide.chk = and i1 %rcA, %rcB + call void (i1, ...)
@llvm.experimental.guard(i1 %wide.chk, i32 9) [ "deopt"() ] + %indexA.ext = zext i32 %indexA to i64 + %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext + %eltA = load i8, i8* %addrA + %indexB.ext = zext i16 %indexB to i64 + %addrB = getelementptr inbounds i8, i8* %arrB, i64 %indexB.ext + store i8 %eltA, i8* %addrB + %iv.next = add nuw nsw i64 %iv, 1 + %latch.check = icmp ult i64 %iv.next, 16 + br i1 %latch.check, label %loop, label %exit + +exit: + ret i64 %iv +} + + +; Consider an IV of type long and an array access into int array. +; IV is of type i64 while the range check operands are of type i32 and i64. +define i64 @iv_rc_different_types(i32 %offA, i32 %offB, i8* %arrA, i8* %arrB, i64 %max) +{ +; CHECK-LABEL: iv_rc_different_types +entry: +; CHECK-LABEL: entry: +; CHECK: [[lenB:[^ ]+]] = add i32 %lengthB, -1 +; CHECK-NEXT: [[idxB:[^ ]+]] = sub i32 [[lenB]], %offB +; CHECK-NEXT: [[limit_checkB:[^ ]+]] = icmp ule i32 15, [[idxB]] +; CHECK-NEXT: [[first_iteration_checkB:[^ ]+]] = icmp ult i32 %offB, %lengthB +; CHECK-NEXT: [[WideChkB:[^ ]+]] = and i1 [[first_iteration_checkB]], [[limit_checkB]] +; CHECK-NEXT: [[maxMinusOne:[^ ]+]] = add i64 %max, -1 +; CHECK-NEXT: [[limit_checkMax:[^ ]+]] = icmp ule i64 15, [[maxMinusOne]] +; CHECK-NEXT: [[first_iteration_checkMax:[^ ]+]] = icmp ult i64 0, %max +; CHECK-NEXT: [[WideChkMax:[^ ]+]] = and i1 [[first_iteration_checkMax]], [[limit_checkMax]] +; CHECK-NEXT: [[lenA:[^ ]+]] = add i32 %lengthA, -1 +; CHECK-NEXT: [[idxA:[^ ]+]] = sub i32 [[lenA]], %offA +; CHECK-NEXT: [[limit_checkA:[^ ]+]] = icmp ule i32 15, [[idxA]] +; CHECK-NEXT: [[first_iteration_checkA:[^ ]+]] = icmp ult i32 %offA, %lengthA +; CHECK-NEXT: [[WideChkA:[^ ]+]] = and i1 [[first_iteration_checkA]], [[limit_checkA]] + %lengthA = call i32 @length(i8* %arrA) + %lengthB = call i32 @length(i8* %arrB) + br label %loop + +loop: +; CHECK-LABEL: loop: +; CHECK: [[BandMax:[^ ]+]] = and i1 [[WideChkB]], [[WideChkMax]] +; CHECK: [[ABandMax:[^ ]+]] = and i1 [[BandMax]], [[WideChkA]] +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[ABandMax]], i32 9) + %iv = phi i64 [0, %entry ], [ %iv.next, %loop ] + %iv.trunc = trunc i64 %iv to i32 + %indexA = add i32 %iv.trunc, %offA + %indexB = add i32 %iv.trunc, %offB + %rcA = icmp ult i32 %indexA, %lengthA + %rcIV = icmp ult i64 %iv, %max + %wide.chk = and i1 %rcA, %rcIV + %rcB = icmp ult i32 %indexB, %lengthB + %wide.chk.final = and i1 %wide.chk, %rcB + call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk.final, i32 9) [ "deopt"() ] + %indexA.ext = zext i32 %indexA to i64 + %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext + %eltA = load i8, i8* %addrA + %indexB.ext = zext i32 %indexB to i64 + %addrB = getelementptr inbounds i8, i8* %arrB, i64 %indexB.ext + %eltB = load i8, i8* %addrB + %result = xor i8 %eltA, %eltB + store i8 %result, i8* %addrA + %iv.next = add nuw nsw i64 %iv, 1 + %latch.check = icmp ult i64 %iv, 15 + br i1 %latch.check, label %loop, label %exit + +exit: + ret i64 %iv +} + +; cannot narrow the IV to the range type, because we lose information. +; for (i64 i= 5; i>= 2; i++) +; this loop wraps around after reaching 2^64. +define i64 @iv_rc_different_type(i32 %offA, i8* %arrA) { +; CHECK-LABEL: iv_rc_different_type +entry: + %lengthA = call i32 @length(i8* %arrA) + br label %loop + +loop: +; CHECK-LABEL: loop: +; CHECK: %rcA = icmp ult i32 %indexA, %lengthA +; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 %rcA, i32 9) + %iv = phi i64 [ 5, %entry ], [ %iv.next, %loop ] + %iv.trunc.32 = trunc i64 %iv to i32 + %indexA = add i32 %iv.trunc.32, %offA + %rcA = icmp ult i32 %indexA, %lengthA + call void (i1, ...) @llvm.experimental.guard(i1 %rcA, i32 9) [ "deopt"() ] + %indexA.ext = zext i32 %indexA to i64 + %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext + %eltA = load i8, i8* %addrA + %res = add i8 %eltA, 2 + store i8 %eltA, i8* %addrA + %iv.next = add i64 %iv, 1 + %latch.check = icmp sge i64 %iv.next, 2 + br i1 %latch.check, label %loop, label %exit + +exit: + ret i64 %iv +} -- cgit v1.2.3 From dc666ea9df629f7b5ec1506993f15d406a52acc6 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 2 Nov 2017 21:35:37 +0000 Subject: Clean up comments in include/llvm-c/DebugInfo.h Patch by Harlan Haskins! Differential Revision: https://reviews.llvm.org/D39568 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317271 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm-c/DebugInfo.h | 143 ++++++++++++++++++++++++++------------------- 1 file changed, 84 insertions(+), 59 deletions(-) diff --git a/include/llvm-c/DebugInfo.h b/include/llvm-c/DebugInfo.h index 2c2fdbdf173..a27b351577a 100644 --- a/include/llvm-c/DebugInfo.h +++ b/include/llvm-c/DebugInfo.h @@ -23,7 +23,9 @@ extern "C" { #endif -/// Debug info flags. +/** + * Debug info flags. + */ typedef enum { LLVMDIFlagZero = 0, LLVMDIFlagPrivate = 1, @@ -58,7 +60,9 @@ typedef enum { LLVMDIFlagVirtualInheritance } LLVMDIFlags; -/// Source languages known by DWARF. +/** + * Source languages known by DWARF. + */ typedef enum { LLVMDWARFSourceLanguageC89, LLVMDWARFSourceLanguageC, @@ -106,68 +110,85 @@ typedef enum { LLVMDWARFSourceLanguageBORLAND_Delphi } LLVMDWARFSourceLanguage; -/// The amount of debug information to emit. +/** + * The amount of debug information to emit. + */ typedef enum { LLVMDWARFEmissionNone = 0, LLVMDWARFEmissionFull, LLVMDWARFEmissionLineTablesOnly } LLVMDWARFEmissionKind; -/// The current debug metadata version number. +/** + * The current debug metadata version number. + */ unsigned LLVMDebugMetadataVersion(void); -/// The version of debug metadata that's present in the provided \c Module. +/** + * The version of debug metadata that's present in the provided \c Module. + */ unsigned LLVMGetModuleDebugMetadataVersion(LLVMModuleRef Module); -/// Strip debug info in the module if it exists. -/// -/// To do this, we remove all calls to the debugger intrinsics and any named -/// metadata for debugging. We also remove debug locations for instructions. -/// Return true if module is modified. +/** + * Strip debug info in the module if it exists. + * To do this, we remove all calls to the debugger intrinsics and any named + * metadata for debugging. We also remove debug locations for instructions. + * Return true if module is modified. + */ LLVMBool LLVMStripModuleDebugInfo(LLVMModuleRef Module); -/// Construct a builder for a module, and do not allow for unresolved nodes -/// attached to the module. +/** + * Construct a builder for a module, and do not allow for unresolved nodes + * attached to the module. + */ LLVMDIBuilderRef LLVMCreateDIBuilderDisallowUnresolved(LLVMModuleRef M); -/// Construct a builder for a module and collect unresolved nodes attached -/// to the module in order to resolve cycles during a call to -/// \c LLVMDIBuilderFinalize. 
+/** + * Construct a builder for a module and collect unresolved nodes attached + * to the module in order to resolve cycles during a call to + * \c LLVMDIBuilderFinalize. + */ LLVMDIBuilderRef LLVMCreateDIBuilder(LLVMModuleRef M); -/// Deallocates the DIBuilder and everything it owns. -/// @note You must call \c LLVMDIBuilderFinalize before this +/** + * Deallocates the DIBuilder and everything it owns. + * @note You must call \c LLVMDIBuilderFinalize before this + */ void LLVMDisposeDIBuilder(LLVMDIBuilderRef Builder); -/// Construct any deferred debug info descriptors. +/** + * Construct any deferred debug info descriptors. + */ void LLVMDIBuilderFinalize(LLVMDIBuilderRef Builder); -/// A CompileUnit provides an anchor for all debugging -/// information generated during this instance of compilation. -/// \param Lang Source programming language, eg. -/// \c LLVMDWARFSourceLanguageC99 -/// \param FileRef File info. -/// \param Producer Identify the producer of debugging information -/// and code. Usually this is a compiler -/// version string. -/// \param ProducerLen The length of the C string passed to \c Producer. -/// \param isOptimized A boolean flag which indicates whether optimization -/// is enabled or not. -/// \param Flags This string lists command line options. This -/// string is directly embedded in debug info -/// output which may be used by a tool -/// analyzing generated debugging information. -/// \param FlagsLen The length of the C string passed to \c Flags. -/// \param RuntimeVer This indicates runtime version for languages like -/// Objective-C. -/// \param SplitName The name of the file that we'll split debug info -/// out into. -/// \param SplitNameLen The length of the C string passed to \c SplitName. -/// \param Kind The kind of debug information to generate. -/// \param DWOId The DWOId if this is a split skeleton compile unit. -/// \param SplitDebugInlining Whether to emit inline debug info. -/// \param DebugInfoForProfiling Whether to emit extra debug info for -/// profile collection. +/** + * A CompileUnit provides an anchor for all debugging + * information generated during this instance of compilation. + * \param Lang Source programming language, eg. + * \c LLVMDWARFSourceLanguageC99 + * \param FileRef File info. + * \param Producer Identify the producer of debugging information + * and code. Usually this is a compiler + * version string. + * \param ProducerLen The length of the C string passed to \c Producer. + * \param isOptimized A boolean flag which indicates whether optimization + * is enabled or not. + * \param Flags This string lists command line options. This + * string is directly embedded in debug info + * output which may be used by a tool + * analyzing generated debugging information. + * \param FlagsLen The length of the C string passed to \c Flags. + * \param RuntimeVer This indicates runtime version for languages like + * Objective-C. + * \param SplitName The name of the file that we'll split debug info + * out into. + * \param SplitNameLen The length of the C string passed to \c SplitName. + * \param Kind The kind of debug information to generate. + * \param DWOId The DWOId if this is a split skeleton compile unit. + * \param SplitDebugInlining Whether to emit inline debug info. + * \param DebugInfoForProfiling Whether to emit extra debug info for + * profile collection. 
+ */ LLVMMetadataRef LLVMDIBuilderCreateCompileUnit( LLVMDIBuilderRef Builder, LLVMDWARFSourceLanguage Lang, LLVMMetadataRef FileRef, const char *Producer, size_t ProducerLen, @@ -176,32 +197,36 @@ LLVMMetadataRef LLVMDIBuilderCreateCompileUnit( LLVMDWARFEmissionKind Kind, unsigned DWOId, LLVMBool SplitDebugInlining, LLVMBool DebugInfoForProfiling); -/// Create a file descriptor to hold debugging information for a file. -/// \param Builder The DIBuilder. -/// \param Filename File name. -/// \param FilenameLen The length of the C string passed to \c Filename. -/// \param Directory Directory. -/// \param DirectoryLen The length of the C string passed to \c Directory. +/** + * Create a file descriptor to hold debugging information for a file. + * \param Builder The DIBuilder. + * \param Filename File name. + * \param FilenameLen The length of the C string passed to \c Filename. + * \param Directory Directory. + * \param DirectoryLen The length of the C string passed to \c Directory. + */ LLVMMetadataRef LLVMDIBuilderCreateFile(LLVMDIBuilderRef Builder, const char *Filename, size_t FilenameLen, const char *Directory, size_t DirectoryLen); -/// Creates a new DebugLocation that describes a source location. -/// \param Line The line in the source file. -/// \param Column The column in the source file. -/// \param Scope The scope in which the location resides. -/// \param InlinedAt The scope where this location was inlined, if at all. -/// (optional). -/// \note If the item to which this location is attached cannot be -/// attributed to a source line, pass 0 for the line and column. +/** + * Creates a new DebugLocation that describes a source location. + * \param Line The line in the source file. + * \param Column The column in the source file. + * \param Scope The scope in which the location resides. + * \param InlinedAt The scope where this location was inlined, if at all. + * (optional). + * \note If the item to which this location is attached cannot be + * attributed to a source line, pass 0 for the line and column. + */ LLVMMetadataRef LLVMDIBuilderCreateDebugLocation(LLVMContextRef Ctx, unsigned Line, unsigned Column, LLVMMetadataRef Scope, LLVMMetadataRef InlinedAt); #ifdef __cplusplus -} // end extern "C" +} /* end extern "C" */ #endif #endif -- cgit v1.2.3 From fbb50d9079f4281847a47d0aba5c29455237da63 Mon Sep 17 00:00:00 2001 From: Shoaib Meenai Date: Thu, 2 Nov 2017 21:43:32 +0000 Subject: [tools] Add option to install binutils symlinks The LLVM tools can be used as a replacement for binutils, in which case it's convenient to create symlinks with the binutils names. Add support for these symlinks in the build system. As with any other llvm tool symlinks, the user can limit the installed symlinks by only adding the desired ones to `LLVM_TOOLCHAIN_TOOLS`. 
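For illustration, a build could opt in along the following lines, written here as a CMake initial-cache file; this is a minimal sketch, and the file name and the particular tool selection are hypothetical, not part of this change:

    # symlinks.cmake -- hypothetical initial-cache file (a sketch), used as:
    #   cmake -C symlinks.cmake <path-to-llvm-source>
    # Enable the binutils-style symlinks (ar -> llvm-ar, nm -> llvm-nm, ...).
    set(LLVM_INSTALL_BINUTILS_SYMLINKS ON CACHE BOOL "")
    # Per the commit message, listing only the desired tools and symlink names
    # in LLVM_TOOLCHAIN_TOOLS limits which of them the install target emits.
    set(LLVM_TOOLCHAIN_TOOLS llvm-ar llvm-nm ar nm CACHE STRING "")
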
Differential Revision: https://reviews.llvm.org/D39530 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317272 91177308-0d34-0410-b5e6-96231b3b80d8 --- CMakeLists.txt | 3 +++ docs/CMake.rst | 4 ++++ tools/llvm-ar/CMakeLists.txt | 6 ++++++ tools/llvm-cxxfilt/CMakeLists.txt | 4 ++++ tools/llvm-dwp/CMakeLists.txt | 4 ++++ tools/llvm-nm/CMakeLists.txt | 4 ++++ tools/llvm-objcopy/CMakeLists.txt | 4 ++++ tools/llvm-objdump/CMakeLists.txt | 4 ++++ tools/llvm-readobj/CMakeLists.txt | 4 ++++ tools/llvm-size/CMakeLists.txt | 4 ++++ tools/llvm-strings/CMakeLists.txt | 3 +++ tools/llvm-symbolizer/CMakeLists.txt | 4 ++++ 12 files changed, 48 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e27562dc8b5..6328f1e18c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -179,6 +179,9 @@ set(CMAKE_MODULE_PATH # for use by clang_complete, YouCompleteMe, etc. set(CMAKE_EXPORT_COMPILE_COMMANDS 1) +option(LLVM_INSTALL_BINUTILS_SYMLINKS + "Install symlinks from the binutils tool names to the corresponding LLVM tools." OFF) + option(LLVM_INSTALL_UTILS "Include utility binaries in the 'install' target." OFF) option(LLVM_INSTALL_TOOLCHAIN_ONLY "Only include toolchain files in the 'install' target." OFF) diff --git a/docs/CMake.rst b/docs/CMake.rst index 473672b5f73..05edec64da3 100644 --- a/docs/CMake.rst +++ b/docs/CMake.rst @@ -224,6 +224,10 @@ LLVM-specific variables Generate build targets for the LLVM tools. Defaults to ON. You can use this option to disable the generation of build targets for the LLVM tools. +**LLVM_INSTALL_BINUTILS_SYMLINKS**:BOOL + Install symlinks from the binutils tool names to the corresponding LLVM tools. + For example, ar will be symlinked to llvm-ar. + **LLVM_BUILD_EXAMPLES**:BOOL Build LLVM examples. Defaults to OFF. Targets for building each example are generated in any case. 
See documentation for *LLVM_BUILD_TOOLS* above for more diff --git a/tools/llvm-ar/CMakeLists.txt b/tools/llvm-ar/CMakeLists.txt index 731bcbd8ac9..2970a59beee 100644 --- a/tools/llvm-ar/CMakeLists.txt +++ b/tools/llvm-ar/CMakeLists.txt @@ -17,3 +17,9 @@ add_llvm_tool(llvm-ar add_llvm_tool_symlink(llvm-ranlib llvm-ar) add_llvm_tool_symlink(llvm-lib llvm-ar) add_llvm_tool_symlink(llvm-dlltool llvm-ar) + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(ar llvm-ar) + add_llvm_tool_symlink(dlltool llvm-ar) + add_llvm_tool_symlink(ranlib llvm-ar) +endif() diff --git a/tools/llvm-cxxfilt/CMakeLists.txt b/tools/llvm-cxxfilt/CMakeLists.txt index 488064d08da..2a78acad80a 100644 --- a/tools/llvm-cxxfilt/CMakeLists.txt +++ b/tools/llvm-cxxfilt/CMakeLists.txt @@ -6,3 +6,7 @@ set(LLVM_LINK_COMPONENTS add_llvm_tool(llvm-cxxfilt llvm-cxxfilt.cpp ) + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(c++filt llvm-cxxfilt) +endif() diff --git a/tools/llvm-dwp/CMakeLists.txt b/tools/llvm-dwp/CMakeLists.txt index 98d67e04fe6..1b5fbddc1f7 100644 --- a/tools/llvm-dwp/CMakeLists.txt +++ b/tools/llvm-dwp/CMakeLists.txt @@ -15,3 +15,7 @@ add_llvm_tool(llvm-dwp DEPENDS intrinsics_gen ) + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(dwp llvm-dwp) +endif() diff --git a/tools/llvm-nm/CMakeLists.txt b/tools/llvm-nm/CMakeLists.txt index 08bcd5f3089..f093cc4328a 100644 --- a/tools/llvm-nm/CMakeLists.txt +++ b/tools/llvm-nm/CMakeLists.txt @@ -14,3 +14,7 @@ add_llvm_tool(llvm-nm DEPENDS intrinsics_gen ) + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(nm llvm-nm) +endif() diff --git a/tools/llvm-objcopy/CMakeLists.txt b/tools/llvm-objcopy/CMakeLists.txt index 18cc2075345..05aa727ab9d 100644 --- a/tools/llvm-objcopy/CMakeLists.txt +++ b/tools/llvm-objcopy/CMakeLists.txt @@ -7,3 +7,7 @@ add_llvm_tool(llvm-objcopy llvm-objcopy.cpp Object.cpp ) + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(objcopy llvm-objcopy) +endif() diff --git a/tools/llvm-objdump/CMakeLists.txt b/tools/llvm-objdump/CMakeLists.txt index 27e6145dfc1..043a181d639 100644 --- a/tools/llvm-objdump/CMakeLists.txt +++ b/tools/llvm-objdump/CMakeLists.txt @@ -25,3 +25,7 @@ add_llvm_tool(llvm-objdump if(HAVE_LIBXAR) target_link_libraries(llvm-objdump ${XAR_LIB}) endif() + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(objdump llvm-objdump) +endif() diff --git a/tools/llvm-readobj/CMakeLists.txt b/tools/llvm-readobj/CMakeLists.txt index 54471674173..dafc9e10cfa 100644 --- a/tools/llvm-readobj/CMakeLists.txt +++ b/tools/llvm-readobj/CMakeLists.txt @@ -23,3 +23,7 @@ add_llvm_tool(llvm-readobj ) add_llvm_tool_symlink(llvm-readelf llvm-readobj) + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(readelf llvm-readobj) +endif() diff --git a/tools/llvm-size/CMakeLists.txt b/tools/llvm-size/CMakeLists.txt index 60345739c35..7ef4f1769b8 100644 --- a/tools/llvm-size/CMakeLists.txt +++ b/tools/llvm-size/CMakeLists.txt @@ -6,3 +6,7 @@ set(LLVM_LINK_COMPONENTS add_llvm_tool(llvm-size llvm-size.cpp ) + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(size llvm-size) +endif() diff --git a/tools/llvm-strings/CMakeLists.txt b/tools/llvm-strings/CMakeLists.txt index 9339892a499..390f1175139 100644 --- a/tools/llvm-strings/CMakeLists.txt +++ b/tools/llvm-strings/CMakeLists.txt @@ -8,3 +8,6 @@ add_llvm_tool(llvm-strings llvm-strings.cpp ) +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(strings llvm-strings) +endif() diff --git a/tools/llvm-symbolizer/CMakeLists.txt 
b/tools/llvm-symbolizer/CMakeLists.txt index b04c45ff744..d9b05208afd 100644 --- a/tools/llvm-symbolizer/CMakeLists.txt +++ b/tools/llvm-symbolizer/CMakeLists.txt @@ -14,3 +14,7 @@ set(LLVM_LINK_COMPONENTS add_llvm_tool(llvm-symbolizer llvm-symbolizer.cpp ) + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(addr2line llvm-symbolizer) +endif() -- cgit v1.2.3 From da35e5e8bec2e0110f896b4ef677445187c7ab42 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Thu, 2 Nov 2017 21:56:59 +0000 Subject: [Hexagon] Prefer L2_loadrub_io over L4_loadrub_rr If the offset is an immediate, avoid putting it in a register to get Rs+Rt<<#0. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317275 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Hexagon/HexagonPatterns.td | 134 +++++++++++++++++++++------------- test/CodeGen/Hexagon/isel-prefer.ll | 10 +++ 2 files changed, 92 insertions(+), 52 deletions(-) diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index d432bfef7ae..05865c43f2d 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -1706,28 +1706,27 @@ multiclass Loadxim_pat; } -// Patterns to select load reg reg-indexed: Rs + Rt< { - let AddedComplexity = 40 in - def: Pat<(VT (Load (add I32:$Rs, (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))), - (VT (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2))>; - - let AddedComplexity = 20 in - def: Pat<(VT (Load (add I32:$Rs, I32:$Rt))), - (VT (MI IntRegs:$Rs, IntRegs:$Rt, 0))>; -} - -// Patterns to select load reg reg-indexed: Rs + Rt< { - let AddedComplexity = 40 in - def: Pat<(VT (Load (add I32:$Rs, (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))), - (VT (ValueMod (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2)))>; +// Pattern to select load reg reg-indexed: Rs + Rt< + : Pat<(VT (Load (add I32:$Rs, (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))), + (VT (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2))>; + +// Pattern to select load reg reg-indexed: Rs + Rt<<0. +class Loadxr_add_pat + : Pat<(VT (Load (add I32:$Rs, I32:$Rt))), + (VT (MI IntRegs:$Rs, IntRegs:$Rt, 0))>; + +// Pattern to select load reg reg-indexed: Rs + Rt< + : Pat<(VT (Load (add I32:$Rs, (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))), + (VT (ValueMod (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2)))>; - let AddedComplexity = 20 in - def: Pat<(VT (Load (add I32:$Rs, I32:$Rt))), - (VT (ValueMod (MI IntRegs:$Rs, IntRegs:$Rt, 0)))>; -} +// Pattern to select load reg reg-indexed: Rs + Rt<<0 with value modifier. 
+class Loadxrm_add_pat + : Pat<(VT (Load (add I32:$Rs, I32:$Rt))), + (VT (ValueMod (MI IntRegs:$Rs, IntRegs:$Rt, 0)))>; // Pattern to select load long-offset reg-indexed: Addr + Rt<; } -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; +let AddedComplexity = 30 in { + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; +} let AddedComplexity = 60 in { def: Loadxu_pat; @@ -1818,26 +1819,55 @@ let AddedComplexity = 60 in { def: Loadxum_pat; } -defm: Loadxr_pat; -defm: Loadxr_pat; -defm: Loadxr_pat; -defm: Loadxr_pat; -defm: Loadxr_pat; -defm: Loadxr_pat; -defm: Loadxr_pat; -defm: Loadxr_pat; -defm: Loadxr_pat; -defm: Loadxr_pat; - -defm: Loadxrm_pat; -defm: Loadxrm_pat; -defm: Loadxrm_pat; -defm: Loadxrm_pat; -defm: Loadxrm_pat; -defm: Loadxrm_pat; -defm: Loadxrm_pat; -defm: Loadxrm_pat; -defm: Loadxrm_pat; +let AddedComplexity = 40 in { + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; +} + +let AddedComplexity = 20 in { + def: Loadxr_add_pat; + def: Loadxr_add_pat; + def: Loadxr_add_pat; + def: Loadxr_add_pat; + def: Loadxr_add_pat; + def: Loadxr_add_pat; + def: Loadxr_add_pat; + def: Loadxr_add_pat; + def: Loadxr_add_pat; + def: Loadxr_add_pat; +} + +let AddedComplexity = 40 in { + def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; +} + +let AddedComplexity = 20 in { + def: Loadxrm_add_pat; + def: Loadxrm_add_pat; + def: Loadxrm_add_pat; + def: Loadxrm_add_pat; + def: Loadxrm_add_pat; + def: Loadxrm_add_pat; + def: Loadxrm_add_pat; + def: Loadxrm_add_pat; + def: Loadxrm_add_pat; +} // Absolute address diff --git a/test/CodeGen/Hexagon/isel-prefer.ll b/test/CodeGen/Hexagon/isel-prefer.ll index 062b0b3a0ea..7094544f54b 100644 --- a/test/CodeGen/Hexagon/isel-prefer.ll +++ b/test/CodeGen/Hexagon/isel-prefer.ll @@ -54,4 +54,14 @@ b2: ret i32 %v6 } +; CHECK-LABEL: Prefer_L2_loadrub_io: +; CHECK: memub(r0+#65) +define i64 @Prefer_L2_loadrub_io(i8* %a0) #0 { +b1: + %v2 = getelementptr i8, i8* %a0, i32 65 + %v3 = load i8, i8* %v2 + %v4 = zext i8 %v3 to i64 + ret i64 %v4 +} + attributes #0 = { nounwind readnone } -- cgit v1.2.3 From dd33e177dd838793692d7a291dc5552e30642842 Mon Sep 17 00:00:00 2001 From: Hiroshi Yamauchi Date: Thu, 2 Nov 2017 22:26:51 +0000 Subject: Irreducible loop metadata for more accurate block frequency under PGO. Summary: Currently the block frequency analysis is an approximation for irreducible loops. The new irreducible loop metadata is used to annotate the irreducible loop headers with their header weights based on the PGO profile (currently this is approximated to be evenly weighted) and to help improve the accuracy of the block frequency analysis for irreducible loops. This patch is a basic support for this. 
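For intuition, here is a minimal C sketch (not part of this patch) of the kind of control flow involved; the cycle formed by `first` and `second` can be entered at either label, so both blocks are headers and no single header dominates the loop:

  /* Hypothetical example of an irreducible loop with two headers. */
  int irreducible(int c, int d, int n) {
    int sum = 0;
    if (c)
      goto second;  /* jump into the middle of the cycle */
  first:
    sum += 1;       /* header 1: reached from entry and from the backedge */
  second:
    sum += 2;       /* header 2: reached from entry and by fallthrough */
    if (d && sum < n)
      goto first;   /* backedge closing the two-header cycle */
    return sum;
  }

The metadata introduced here lets the headers of such a cycle carry profile-derived weights rather than the even split used before.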
Reviewers: davidxl Reviewed By: davidxl Subscribers: mehdi_amini, llvm-commits, eraman Differential Revision: https://reviews.llvm.org/D39028 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317278 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/LangRef.rst | 23 +++ include/llvm/Analysis/BlockFrequencyInfo.h | 4 + include/llvm/Analysis/BlockFrequencyInfoImpl.h | 49 ++++- include/llvm/CodeGen/MachineBasicBlock.h | 10 + include/llvm/CodeGen/MachineBlockFrequencyInfo.h | 2 + include/llvm/IR/BasicBlock.h | 2 + include/llvm/IR/LLVMContext.h | 1 + include/llvm/IR/MDBuilder.h | 3 + include/llvm/Transforms/PGOInstrumentation.h | 2 + lib/Analysis/BlockFrequencyInfo.cpp | 5 + lib/Analysis/BlockFrequencyInfoImpl.cpp | 21 +++ lib/CodeGen/MachineBasicBlock.cpp | 8 + lib/CodeGen/MachineBlockFrequencyInfo.cpp | 6 + lib/IR/BasicBlock.cpp | 13 ++ lib/IR/LLVMContext.cpp | 1 + lib/IR/MDBuilder.cpp | 7 + .../Instrumentation/PGOInstrumentation.cpp | 28 ++- .../Analysis/BlockFrequencyInfo/irreducible_pgo.ll | 208 +++++++++++++++++++++ test/ThinLTO/X86/lazyload_metadata.ll | 4 +- .../PGOProfile/Inputs/irreducible.proftext | 29 +++ test/Transforms/PGOProfile/irreducible.ll | 184 ++++++++++++++++++ 21 files changed, 600 insertions(+), 10 deletions(-) create mode 100644 test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll create mode 100644 test/Transforms/PGOProfile/Inputs/irreducible.proftext create mode 100644 test/Transforms/PGOProfile/irreducible.ll diff --git a/docs/LangRef.rst b/docs/LangRef.rst index 9d910568bd5..6823fe5fcd7 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -5194,6 +5194,29 @@ the loop identifier metadata node directly: !1 = !{!1} ; an identifier for the inner loop !2 = !{!2} ; an identifier for the outer loop +'``irr_loop``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^ + +``irr_loop`` metadata may be attached to the terminator instruction of a basic +block that's an irreducible loop header (note that an irreducible loop has more +than one header basic block.) If ``irr_loop`` metadata is attached to the +terminator instruction of a basic block that is not really an irreducible loop +header, the behavior is undefined. The intent of this metadata is to improve the +accuracy of the block frequency propagation. For example, in the code below, the +block ``header0`` may have a loop header weight (relative to the other headers of +the irreducible loop) of 100: + +.. code-block:: llvm + + header0: + ... + br i1 %cmp, label %t1, label %t2, !irr_loop !0 + + ... + !0 = !{"loop_header_weight", i64 100} + +Irreducible loop header weights are typically based on profile data. + '``invariant.group``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/include/llvm/Analysis/BlockFrequencyInfo.h b/include/llvm/Analysis/BlockFrequencyInfo.h index d663b09d5cf..89370cbeeea 100644 --- a/include/llvm/Analysis/BlockFrequencyInfo.h +++ b/include/llvm/Analysis/BlockFrequencyInfo.h @@ -75,6 +75,10 @@ public: /// the enclosing function's count (if available) and returns the value. Optional<uint64_t> getProfileCountFromFreq(uint64_t Freq) const; + /// \brief Returns true if \p BB is an irreducible loop header + /// block. Otherwise false. + bool isIrrLoopHeader(const BasicBlock *BB); + // Set the frequency of the given basic block.
void setBlockFreq(const BasicBlock *BB, uint64_t Freq); diff --git a/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/include/llvm/Analysis/BlockFrequencyInfoImpl.h index 7f166f4a646..7b916e3653b 100644 --- a/include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ b/include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -20,6 +20,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/iterator_range.h" #include "llvm/IR/BasicBlock.h" @@ -414,6 +415,10 @@ public: /// \brief Data about each block. This is used downstream. std::vector Freqs; + /// \brief Whether each block is an irreducible loop header. + /// This is used downstream. + SparseBitVector<> IsIrrLoopHeader; + /// \brief Loop data: see initializeLoops(). std::vector Working; @@ -492,6 +497,8 @@ public: /// the backedges going into each of the loop headers. void adjustLoopHeaderMass(LoopData &Loop); + void distributeIrrLoopHeaderMass(Distribution &Dist); + /// \brief Package up a loop. void packageLoop(LoopData &Loop); @@ -520,6 +527,7 @@ public: const BlockNode &Node) const; Optional getProfileCountFromFreq(const Function &F, uint64_t Freq) const; + bool isIrrLoopHeader(const BlockNode &Node); void setBlockFreq(const BlockNode &Node, uint64_t Freq); @@ -973,6 +981,10 @@ public: return BlockFrequencyInfoImplBase::getProfileCountFromFreq(F, Freq); } + bool isIrrLoopHeader(const BlockT *BB) { + return BlockFrequencyInfoImplBase::isIrrLoopHeader(getNode(BB)); + } + void setBlockFreq(const BlockT *BB, uint64_t Freq); Scaled64 getFloatingBlockFreq(const BlockT *BB) const { @@ -1140,17 +1152,39 @@ bool BlockFrequencyInfoImpl::computeMassInLoop(LoopData &Loop) { DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n"); if (Loop.isIrreducible()) { - BlockMass Remaining = BlockMass::getFull(); + DEBUG(dbgs() << "isIrreducible = true\n"); + Distribution Dist; + unsigned NumHeadersWithWeight = 0; for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { - auto &Mass = Working[Loop.Nodes[H].Index].getMass(); - Mass = Remaining * BranchProbability(1, Loop.NumHeaders - H); - Remaining -= Mass; + auto &HeaderNode = Loop.Nodes[H]; + const BlockT *Block = getBlock(HeaderNode); + IsIrrLoopHeader.set(Loop.Nodes[H].Index); + Optional HeaderWeight = Block->getIrrLoopHeaderWeight(); + if (!HeaderWeight) + continue; + DEBUG(dbgs() << getBlockName(HeaderNode) + << " has irr loop header weight " << HeaderWeight.getValue() + << "\n"); + NumHeadersWithWeight++; + uint64_t HeaderWeightValue = HeaderWeight.getValue(); + if (HeaderWeightValue) + Dist.addLocal(HeaderNode, HeaderWeightValue); } + if (NumHeadersWithWeight != Loop.NumHeaders) { + // Not all headers have a weight metadata. Distribute weight evenly. + Dist = Distribution(); + for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { + auto &HeaderNode = Loop.Nodes[H]; + Dist.addLocal(HeaderNode, 1); + } + } + distributeIrrLoopHeaderMass(Dist); for (const BlockNode &M : Loop.Nodes) if (!propagateMassToSuccessors(&Loop, M)) llvm_unreachable("unhandled irreducible control flow"); - - adjustLoopHeaderMass(Loop); + if (NumHeadersWithWeight != Loop.NumHeaders) + // Not all headers have a weight metadata. Adjust header mass. 
+ adjustLoopHeaderMass(Loop); } else { Working[Loop.getHeader().Index].getMass() = BlockMass::getFull(); if (!propagateMassToSuccessors(&Loop, Loop.getHeader())) @@ -1285,6 +1319,9 @@ raw_ostream &BlockFrequencyInfoImpl::print(raw_ostream &OS) const { BlockFrequencyInfoImplBase::getBlockProfileCount( *F->getFunction(), getNode(&BB))) OS << ", count = " << ProfileCount.getValue(); + if (Optional IrrLoopHeaderWeight = + BB.getIrrLoopHeaderWeight()) + OS << ", irr_loop_header_weight = " << IrrLoopHeaderWeight.getValue(); OS << "\n"; } diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index 51a0d96deda..0f5b04d9045 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -97,6 +97,8 @@ private: using const_probability_iterator = std::vector::const_iterator; + Optional IrrLoopHeaderWeight; + /// Keep track of the physical registers that are livein of the basicblock. using LiveInVector = std::vector; LiveInVector LiveIns; @@ -729,6 +731,14 @@ public: /// Return the MCSymbol for this basic block. MCSymbol *getSymbol() const; + Optional getIrrLoopHeaderWeight() const { + return IrrLoopHeaderWeight; + } + + void setIrrLoopHeaderWeight(uint64_t Weight) { + IrrLoopHeaderWeight = Weight; + } + private: /// Return probability iterator corresponding to the I successor iterator. probability_iterator getProbabilityIterator(succ_iterator I); diff --git a/include/llvm/CodeGen/MachineBlockFrequencyInfo.h b/include/llvm/CodeGen/MachineBlockFrequencyInfo.h index cba79c818a7..5b4b99ca0a5 100644 --- a/include/llvm/CodeGen/MachineBlockFrequencyInfo.h +++ b/include/llvm/CodeGen/MachineBlockFrequencyInfo.h @@ -62,6 +62,8 @@ public: Optional getBlockProfileCount(const MachineBasicBlock *MBB) const; Optional getProfileCountFromFreq(uint64_t Freq) const; + bool isIrrLoopHeader(const MachineBasicBlock *MBB); + const MachineFunction *getFunction() const; const MachineBranchProbabilityInfo *getMBPI() const; void view(const Twine &Name, bool isSimple = true) const; diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h index 6714f2c9747..77cfc9776df 100644 --- a/include/llvm/IR/BasicBlock.h +++ b/include/llvm/IR/BasicBlock.h @@ -398,6 +398,8 @@ public: /// \brief Return true if it is legal to hoist instructions into this block. bool isLegalToHoistInto() const; + Optional getIrrLoopHeaderWeight() const; + private: /// \brief Increment the internal refcount of the number of BlockAddresses /// referencing this BasicBlock by \p Amt. diff --git a/include/llvm/IR/LLVMContext.h b/include/llvm/IR/LLVMContext.h index 9e935823c77..a95634d32c2 100644 --- a/include/llvm/IR/LLVMContext.h +++ b/include/llvm/IR/LLVMContext.h @@ -101,6 +101,7 @@ public: MD_absolute_symbol = 21, // "absolute_symbol" MD_associated = 22, // "associated" MD_callees = 23, // "callees" + MD_irr_loop = 24, // "irr_loop" }; /// Known operand bundle tag IDs, which always have the same value. All diff --git a/include/llvm/IR/MDBuilder.h b/include/llvm/IR/MDBuilder.h index d679cef95b6..15c1b9cb60e 100644 --- a/include/llvm/IR/MDBuilder.h +++ b/include/llvm/IR/MDBuilder.h @@ -173,6 +173,9 @@ public: /// base type, access type and offset relative to the base type. MDNode *createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType, uint64_t Offset, bool IsConstant = false); + + /// \brief Return metadata containing an irreducible loop header weight. 
+ MDNode *createIrrLoopHeaderWeight(uint64_t Weight); }; } // end namespace llvm diff --git a/include/llvm/Transforms/PGOInstrumentation.h b/include/llvm/Transforms/PGOInstrumentation.h index fa7a68624ec..c2cc76c422d 100644 --- a/include/llvm/Transforms/PGOInstrumentation.h +++ b/include/llvm/Transforms/PGOInstrumentation.h @@ -68,6 +68,8 @@ public: void setProfMetadata(Module *M, Instruction *TI, ArrayRef EdgeCounts, uint64_t MaxCount); +void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count); + } // end namespace llvm #endif // LLVM_TRANSFORMS_PGOINSTRUMENTATION_H diff --git a/lib/Analysis/BlockFrequencyInfo.cpp b/lib/Analysis/BlockFrequencyInfo.cpp index 5d2170dcf15..41c29589521 100644 --- a/lib/Analysis/BlockFrequencyInfo.cpp +++ b/lib/Analysis/BlockFrequencyInfo.cpp @@ -218,6 +218,11 @@ BlockFrequencyInfo::getProfileCountFromFreq(uint64_t Freq) const { return BFI->getProfileCountFromFreq(*getFunction(), Freq); } +bool BlockFrequencyInfo::isIrrLoopHeader(const BasicBlock *BB) { + assert(BFI && "Expected analysis to be available"); + return BFI->isIrrLoopHeader(BB); +} + void BlockFrequencyInfo::setBlockFreq(const BasicBlock *BB, uint64_t Freq) { assert(BFI && "Expected analysis to be available"); BFI->setBlockFreq(BB, Freq); diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp index 1030407b766..7e323022d9c 100644 --- a/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ b/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -271,6 +271,7 @@ void BlockFrequencyInfoImplBase::clear() { // Swap with a default-constructed std::vector, since std::vector<>::clear() // does not actually clear heap storage. std::vector().swap(Freqs); + IsIrrLoopHeader.clear(); std::vector().swap(Working); Loops.clear(); } @@ -280,8 +281,10 @@ void BlockFrequencyInfoImplBase::clear() { /// Releases all memory not used downstream. In particular, saves Freqs. 
static void cleanup(BlockFrequencyInfoImplBase &BFI) { std::vector SavedFreqs(std::move(BFI.Freqs)); + SparseBitVector<> SavedIsIrrLoopHeader(std::move(BFI.IsIrrLoopHeader)); BFI.clear(); BFI.Freqs = std::move(SavedFreqs); + BFI.IsIrrLoopHeader = std::move(SavedIsIrrLoopHeader); } bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, @@ -572,6 +575,13 @@ BlockFrequencyInfoImplBase::getProfileCountFromFreq(const Function &F, return BlockCount.getLimitedValue(); } +bool +BlockFrequencyInfoImplBase::isIrrLoopHeader(const BlockNode &Node) { + if (!Node.isValid()) + return false; + return IsIrrLoopHeader.test(Node.Index); +} + Scaled64 BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const { if (!Node.isValid()) @@ -819,3 +829,14 @@ void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) { DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); } } + +void BlockFrequencyInfoImplBase::distributeIrrLoopHeaderMass(Distribution &Dist) { + BlockMass LoopMass = BlockMass::getFull(); + DitheringDistributer D(Dist, LoopMass); + for (const Weight &W : Dist.Weights) { + BlockMass Taken = D.takeMass(W.Amount); + assert(W.Type == Weight::Local && "all weights should be local"); + Working[W.TargetNode.Index].getMass() = Taken; + DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); + } +} diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index d5758da0464..d65916f4966 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -42,6 +42,8 @@ using namespace llvm; MachineBasicBlock::MachineBasicBlock(MachineFunction &MF, const BasicBlock *B) : BB(B), Number(-1), xParent(&MF) { Insts.Parent = this; + if (B) + IrrLoopHeaderWeight = B->getIrrLoopHeaderWeight(); } MachineBasicBlock::~MachineBasicBlock() { @@ -338,6 +340,12 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, } OS << '\n'; } + if (IrrLoopHeaderWeight) { + if (Indexes) OS << '\t'; + OS << " Irreducible loop header weight: " + << IrrLoopHeaderWeight.getValue(); + OS << '\n'; + } } void MachineBasicBlock::printAsOperand(raw_ostream &OS, diff --git a/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp index 14cd91206d8..2c336e45056 100644 --- a/lib/CodeGen/MachineBlockFrequencyInfo.cpp +++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp @@ -234,6 +234,12 @@ MachineBlockFrequencyInfo::getProfileCountFromFreq(uint64_t Freq) const { return MBFI ? MBFI->getProfileCountFromFreq(*F, Freq) : None; } +bool +MachineBlockFrequencyInfo::isIrrLoopHeader(const MachineBasicBlock *MBB) { + assert(MBFI && "Expected analysis to be available"); + return MBFI->isIrrLoopHeader(MBB); +} + const MachineFunction *MachineBlockFrequencyInfo::getFunction() const { return MBFI ? 
MBFI->getFunction() : nullptr; } diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp index 2b780adf6c6..22513924a96 100644 --- a/lib/IR/BasicBlock.cpp +++ b/lib/IR/BasicBlock.cpp @@ -447,3 +447,16 @@ bool BasicBlock::isLandingPad() const { const LandingPadInst *BasicBlock::getLandingPadInst() const { return dyn_cast(getFirstNonPHI()); } + +Optional BasicBlock::getIrrLoopHeaderWeight() const { + const TerminatorInst *TI = getTerminator(); + if (MDNode *MDIrrLoopHeader = + TI->getMetadata(LLVMContext::MD_irr_loop)) { + MDString *MDName = cast(MDIrrLoopHeader->getOperand(0)); + if (MDName->getString().equals("loop_header_weight")) { + auto *CI = mdconst::extract(MDIrrLoopHeader->getOperand(1)); + return Optional(CI->getValue().getZExtValue()); + } + } + return Optional(); +} diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp index a94da5452b8..c8b7c10a9a4 100644 --- a/lib/IR/LLVMContext.cpp +++ b/lib/IR/LLVMContext.cpp @@ -60,6 +60,7 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) { {MD_absolute_symbol, "absolute_symbol"}, {MD_associated, "associated"}, {MD_callees, "callees"}, + {MD_irr_loop, "irr_loop"}, }; for (auto &MDKind : MDKinds) { diff --git a/lib/IR/MDBuilder.cpp b/lib/IR/MDBuilder.cpp index 54783e884e9..d8e64db7c5d 100644 --- a/lib/IR/MDBuilder.cpp +++ b/lib/IR/MDBuilder.cpp @@ -197,3 +197,10 @@ MDNode *MDBuilder::createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType, } return MDNode::get(Context, {BaseType, AccessType, createConstant(Off)}); } + +MDNode *MDBuilder::createIrrLoopHeaderWeight(uint64_t Weight) { + SmallVector Vals(2); + Vals[0] = createString("loop_header_weight"); + Vals[1] = createConstant(ConstantInt::get(Type::getInt64Ty(Context), Weight)); + return MDNode::get(Context, Vals); +} diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 11a43e803a9..c92d48396c8 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -844,8 +844,9 @@ public: PGOUseFunc(Function &Func, Module *Modu, std::unordered_multimap &ComdatMembers, BranchProbabilityInfo *BPI = nullptr, - BlockFrequencyInfo *BFI = nullptr) - : F(Func), M(Modu), FuncInfo(Func, ComdatMembers, false, BPI, BFI), + BlockFrequencyInfo *BFIin = nullptr) + : F(Func), M(Modu), BFI(BFIin), + FuncInfo(Func, ComdatMembers, false, BPI, BFIin), FreqAttr(FFA_Normal) {} // Read counts for the instrumented BB from profile. @@ -863,6 +864,9 @@ public: // Annotate the value profile call sites for one value kind. void annotateValueSites(uint32_t Kind); + // Annotate the irreducible loop header weights. + void annotateIrrLoopHeaderWeights(); + // The hotness of the function from the profile count. enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot }; @@ -894,6 +898,7 @@ public: private: Function &F; Module *M; + BlockFrequencyInfo *BFI; // This member stores the shared information with class PGOGenFunc. 
FuncPGOInstrumentation FuncInfo; @@ -1183,6 +1188,18 @@ void PGOUseFunc::setBranchWeights() { } } +void PGOUseFunc::annotateIrrLoopHeaderWeights() { + DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n"); + // Find irr loop headers + for (auto &BB : F) { + if (BFI->isIrrLoopHeader(&BB)) { + TerminatorInst *TI = BB.getTerminator(); + const UseBBInfo &BBCountInfo = getBBInfo(&BB); + setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue); + } + } +} + void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) { Module *M = F.getParent(); IRBuilder<> Builder(&SI); @@ -1441,6 +1458,7 @@ static bool annotateAllFunctions( Func.populateCounters(); Func.setBranchWeights(); Func.annotateValueSites(); + Func.annotateIrrLoopHeaderWeights(); PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr(); if (FreqAttr == PGOUseFunc::FFA_Cold) ColdFunctions.push_back(&F); @@ -1582,6 +1600,12 @@ void llvm::setProfMetadata(Module *M, Instruction *TI, namespace llvm { +void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count) { + MDBuilder MDB(M->getContext()); + TI->setMetadata(llvm::LLVMContext::MD_irr_loop, + MDB.createIrrLoopHeaderWeight(Count)); +} + template <> struct GraphTraits { using NodeRef = const BasicBlock *; using ChildIteratorType = succ_const_iterator; diff --git a/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll b/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll new file mode 100644 index 00000000000..0a580276d95 --- /dev/null +++ b/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll @@ -0,0 +1,208 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s + +; Function Attrs: noinline norecurse nounwind readnone uwtable +define i32 @_Z11irreducibleii(i32 %iter_outer, i32 %iter_inner) local_unnamed_addr !prof !27 { +entry: + %cmp24 = icmp sgt i32 %iter_outer, 0 + br i1 %cmp24, label %for.body, label %entry.for.cond.cleanup_crit_edge, !prof !28 + +entry.for.cond.cleanup_crit_edge: ; preds = %entry + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.end, %entry.for.cond.cleanup_crit_edge + %sum.0.lcssa = phi i32 [ 0, %entry.for.cond.cleanup_crit_edge ], [ %sum.1, %for.end ] + ret i32 %sum.0.lcssa + +for.body: ; preds = %for.end, %entry + %k.026 = phi i32 [ %inc12, %for.end ], [ 0, %entry ] + %sum.025 = phi i32 [ %sum.1, %for.end ], [ 0, %entry ] + %rem23 = and i32 %k.026, 1 + %cmp1 = icmp eq i32 %rem23, 0 + br i1 %cmp1, label %entry8, label %for.cond2, !prof !29 + +for.cond2: ; preds = %if.end9, %for.body + %sum.1 = phi i32 [ %add10, %if.end9 ], [ %sum.025, %for.body ] + %i.0 = phi i32 [ %inc, %if.end9 ], [ 0, %for.body ] + %cmp3 = icmp slt i32 %i.0, %iter_inner + br i1 %cmp3, label %for.body4, label %for.end, !prof !30, !irr_loop !31 + +for.body4: ; preds = %for.cond2 + %rem5 = srem i32 %k.026, 3 + %cmp6 = icmp eq i32 %rem5, 0 + br i1 %cmp6, label %entry8, label %if.end9, !prof !32 + +entry8: ; preds = %for.body4, %for.body + %sum.2 = phi i32 [ %sum.025, %for.body ], [ %sum.1, %for.body4 ] + %i.1 = phi i32 [ 0, %for.body ], [ %i.0, %for.body4 ] + %add = add nsw i32 %sum.2, 4 + br label %if.end9, !irr_loop !33 + +if.end9: ; preds = %entry8, %for.body4 + %sum.3 = phi i32 [ %add, %entry8 ], [ %sum.1, %for.body4 ] + %i.2 = phi i32 [ %i.1, %entry8 ], [ %i.0, %for.body4 ] + %add10 = add nsw i32 %sum.3, 1 + %inc = add nsw i32 %i.2, 1 + br label %for.cond2, !irr_loop !34 + +for.end: ; preds = %for.cond2 + %inc12 = add nuw nsw i32 %k.026, 1 + %exitcond = icmp eq i32 %inc12, 
%iter_outer + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !35 +} + +!27 = !{!"function_entry_count", i64 1} +!28 = !{!"branch_weights", i32 1, i32 0} +!29 = !{!"branch_weights", i32 50, i32 50} +!30 = !{!"branch_weights", i32 950, i32 100} +!31 = !{!"loop_header_weight", i64 1050} +!32 = !{!"branch_weights", i32 323, i32 627} +!33 = !{!"loop_header_weight", i64 373} +!34 = !{!"loop_header_weight", i64 1000} +!35 = !{!"branch_weights", i32 1, i32 99} + +; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreducibleii': +; CHECK-NEXT: block-frequency-info: _Z11irreducibleii +; CHECK-NEXT: - entry: {{.*}} count = 1 +; CHECK-NEXT: - entry.for.cond.cleanup_crit_edge: {{.*}} count = 0 +; CHECK-NEXT: - for.cond.cleanup: {{.*}} count = 1 +; CHECK-NEXT: - for.body: {{.*}} count = 100 +; CHECK-NEXT: - for.cond2: {{.*}} count = 1050, irr_loop_header_weight = 1050 +; CHECK-NEXT: - for.body4: {{.*}} count = 950 +; CHECK-NEXT: - entry8: {{.*}} count = 373, irr_loop_header_weight = 373 +; CHECK-NEXT: - if.end9: {{.*}} count = 1000, irr_loop_header_weight = 1000 +; CHECK-NEXT: - for.end: {{.*}} count = 100 + +@targets = local_unnamed_addr global [256 x i8*] zeroinitializer, align 16 +@tracing = local_unnamed_addr global i32 0, align 4 + +; Function Attrs: noinline norecurse nounwind uwtable +define i32 @_Z11irreduciblePh(i8* nocapture readonly %p) !prof !27 { +entry: + store <2 x i8*> , <2 x i8*>* bitcast ([256 x i8*]* @targets to <2 x i8*>*), align 16 + store i8* blockaddress(@_Z11irreduciblePh, %TARGET_2), i8** getelementptr inbounds ([256 x i8*], [256 x i8*]* @targets, i64 0, i64 2), align 16 + %0 = load i32, i32* @tracing, align 4 + %tobool = icmp eq i32 %0, 0 + br label %for.cond1 + +for.cond1: ; preds = %sw.default, %entry + %p.addr.0 = phi i8* [ %p, %entry ], [ %p.addr.4, %sw.default ] + %sum.0 = phi i32 [ 0, %entry ], [ %add25, %sw.default ] + %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i64 1 + %1 = load i8, i8* %p.addr.0, align 1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %p.addr.0, i64 2 + %2 = load i8, i8* %incdec.ptr, align 1 + %conv3 = zext i8 %2 to i32 + br label %dispatch_op + +dispatch_op: ; preds = %sw.bb6, %for.cond1 + %p.addr.1 = phi i8* [ %incdec.ptr2, %for.cond1 ], [ %p.addr.2, %sw.bb6 ] + %op.0 = phi i8 [ %1, %for.cond1 ], [ 1, %sw.bb6 ] + %oparg.0 = phi i32 [ %conv3, %for.cond1 ], [ %oparg.2, %sw.bb6 ] + %sum.1 = phi i32 [ %sum.0, %for.cond1 ], [ %add7, %sw.bb6 ] + switch i8 %op.0, label %sw.default [ + i8 0, label %sw.bb + i8 1, label %dispatch_op.sw.bb6_crit_edge + i8 2, label %sw.bb15 + ], !prof !36 + +dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op + br label %sw.bb6 + +sw.bb: ; preds = %indirectgoto, %dispatch_op + %oparg.1 = phi i32 [ %oparg.0, %dispatch_op ], [ 0, %indirectgoto ] + %sum.2 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %indirectgoto ] + %add.neg = sub i32 -5, %oparg.1 + %sub = add i32 %add.neg, %sum.2 + br label %exit + +TARGET_1: ; preds = %indirectgoto + %incdec.ptr4 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2 + %3 = load i8, i8* %p.addr.5, align 1 + %conv5 = zext i8 %3 to i32 + br label %sw.bb6 + +sw.bb6: ; preds = %TARGET_1, %dispatch_op.sw.bb6_crit_edge + %p.addr.2 = phi i8* [ %incdec.ptr4, %TARGET_1 ], [ %p.addr.1, %dispatch_op.sw.bb6_crit_edge ] + %oparg.2 = phi i32 [ %conv5, %TARGET_1 ], [ %oparg.0, %dispatch_op.sw.bb6_crit_edge ] + %sum.3 = phi i32 [ %sum.7, %TARGET_1 ], [ %sum.1, %dispatch_op.sw.bb6_crit_edge ] + %mul = mul nsw i32 %oparg.2, 7 + %add7 = add nsw i32 %sum.3, %mul + %rem46 = and 
i32 %add7, 1 + %cmp8 = icmp eq i32 %rem46, 0 + br i1 %cmp8, label %dispatch_op, label %if.then, !prof !37, !irr_loop !38 + +if.then: ; preds = %sw.bb6 + %mul9 = mul nsw i32 %add7, 9 + br label %indirectgoto + +TARGET_2: ; preds = %indirectgoto + %incdec.ptr13 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2 + %4 = load i8, i8* %p.addr.5, align 1 + %conv14 = zext i8 %4 to i32 + br label %sw.bb15 + +sw.bb15: ; preds = %TARGET_2, %dispatch_op + %p.addr.3 = phi i8* [ %p.addr.1, %dispatch_op ], [ %incdec.ptr13, %TARGET_2 ] + %oparg.3 = phi i32 [ %oparg.0, %dispatch_op ], [ %conv14, %TARGET_2 ] + %sum.4 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %TARGET_2 ] + %add16 = add nsw i32 %oparg.3, 3 + %add17 = add nsw i32 %add16, %sum.4 + br i1 %tobool, label %if.then18, label %exit, !prof !39, !irr_loop !40 + +if.then18: ; preds = %sw.bb15 + %idx.ext = sext i32 %oparg.3 to i64 + %add.ptr = getelementptr inbounds i8, i8* %p.addr.3, i64 %idx.ext + %mul19 = mul nsw i32 %add17, 17 + br label %indirectgoto + +unknown_op: ; preds = %indirectgoto + %sub24 = add nsw i32 %sum.7, -4 + br label %sw.default + +sw.default: ; preds = %unknown_op, %dispatch_op + %p.addr.4 = phi i8* [ %p.addr.5, %unknown_op ], [ %p.addr.1, %dispatch_op ] + %sum.5 = phi i32 [ %sub24, %unknown_op ], [ %sum.1, %dispatch_op ] + %add25 = add nsw i32 %sum.5, 11 + br label %for.cond1 + +exit: ; preds = %sw.bb15, %sw.bb + %sum.6 = phi i32 [ %sub, %sw.bb ], [ %add17, %sw.bb15 ] + ret i32 %sum.6 + +indirectgoto: ; preds = %if.then18, %if.then + %add.ptr.pn = phi i8* [ %add.ptr, %if.then18 ], [ %p.addr.2, %if.then ] + %sum.7 = phi i32 [ %mul19, %if.then18 ], [ %mul9, %if.then ] + %p.addr.5 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 1 + %5 = load i8, i8* %add.ptr.pn, align 1 + %idxprom21 = zext i8 %5 to i64 + %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21 + %6 = load i8*, i8** %arrayidx22, align 8 + indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !41, !irr_loop !42 +} + +!36 = !{!"branch_weights", i32 0, i32 0, i32 201, i32 1} +!37 = !{!"branch_weights", i32 201, i32 300} +!38 = !{!"loop_header_weight", i64 501} +!39 = !{!"branch_weights", i32 100, i32 0} +!40 = !{!"loop_header_weight", i64 100} +!41 = !{!"branch_weights", i32 0, i32 1, i32 300, i32 99} +!42 = !{!"loop_header_weight", i64 400} + +; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreduciblePh': +; CHECK-NEXT: block-frequency-info: _Z11irreduciblePh +; CHECK-NEXT: - entry: {{.*}} count = 1 +; CHECK-NEXT: - for.cond1: {{.*}} count = 1 +; CHECK-NEXT: - dispatch_op: {{.*}} count = 201 +; CHECK-NEXT: - dispatch_op.sw.bb6_crit_edge: {{.*}} count = 200 +; CHECK-NEXT: - sw.bb: {{.*}} count = 0 +; CHECK-NEXT: - TARGET_1: {{.*}} count = 299 +; CHECK-NEXT: - sw.bb6: {{.*}} count = 500, irr_loop_header_weight = 501 +; CHECK-NEXT: - if.then: {{.*}} count = 299 +; CHECK-NEXT: - TARGET_2: {{.*}} count = 98 +; CHECK-NEXT: - sw.bb15: {{.*}} count = 99, irr_loop_header_weight = 100 +; CHECK-NEXT: - if.then18: {{.*}} count = 99 +; CHECK-NEXT: - unknown_op: {{.*}} count = 0 +; CHECK-NEXT: - sw.default: {{.*}} count = 0 +; CHECK-NEXT: - exit: {{.*}} count = 1 +; CHECK-NEXT: - indirectgoto: {{.*}} count = 399, irr_loop_header_weight = 400 diff --git a/test/ThinLTO/X86/lazyload_metadata.ll b/test/ThinLTO/X86/lazyload_metadata.ll index a6d46e5586a..4680e462458 100644 --- a/test/ThinLTO/X86/lazyload_metadata.ll +++ b/test/ThinLTO/X86/lazyload_metadata.ll @@ -10,13 +10,13 @@ ; RUN: llvm-lto 
-thinlto-action=import %t2.bc -thinlto-index=%t3.bc \ ; RUN: -o /dev/null -stats \ ; RUN: 2>&1 | FileCheck %s -check-prefix=LAZY -; LAZY: 53 bitcode-reader - Number of Metadata records loaded +; LAZY: 55 bitcode-reader - Number of Metadata records loaded ; LAZY: 2 bitcode-reader - Number of MDStrings loaded ; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \ ; RUN: -o /dev/null -disable-ondemand-mds-loading -stats \ ; RUN: 2>&1 | FileCheck %s -check-prefix=NOTLAZY -; NOTLAZY: 62 bitcode-reader - Number of Metadata records loaded +; NOTLAZY: 64 bitcode-reader - Number of Metadata records loaded ; NOTLAZY: 7 bitcode-reader - Number of MDStrings loaded diff --git a/test/Transforms/PGOProfile/Inputs/irreducible.proftext b/test/Transforms/PGOProfile/Inputs/irreducible.proftext new file mode 100644 index 00000000000..9b0210d9a30 --- /dev/null +++ b/test/Transforms/PGOProfile/Inputs/irreducible.proftext @@ -0,0 +1,29 @@ +:ir +_Z11irreducibleii +# Func Hash: +64451410787 +# Num Counters: +6 +# Counter Values: +1000 +950 +100 +373 +1 +0 + +_Z11irreduciblePh +# Func Hash: +104649601521 +# Num Counters: +9 +# Counter Values: +100 +300 +99 +300 +201 +1 +1 +0 +0 diff --git a/test/Transforms/PGOProfile/irreducible.ll b/test/Transforms/PGOProfile/irreducible.ll new file mode 100644 index 00000000000..37f6e206ee9 --- /dev/null +++ b/test/Transforms/PGOProfile/irreducible.ll @@ -0,0 +1,184 @@ +; RUN: llvm-profdata merge %S/Inputs/irreducible.proftext -o %t.profdata +; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE +; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE + +; GEN: $__llvm_profile_raw_version = comdat any + +; Function Attrs: noinline norecurse nounwind readnone uwtable +define i32 @_Z11irreducibleii(i32 %iter_outer, i32 %iter_inner) local_unnamed_addr #0 { +entry: + %cmp24 = icmp sgt i32 %iter_outer, 0 + br i1 %cmp24, label %for.body, label %entry.for.cond.cleanup_crit_edge + +entry.for.cond.cleanup_crit_edge: ; preds = %entry + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %entry.for.cond.cleanup_crit_edge, %for.end + %sum.0.lcssa = phi i32 [ 0, %entry.for.cond.cleanup_crit_edge ], [ %sum.1, %for.end ] + ret i32 %sum.0.lcssa + +for.body: ; preds = %entry, %for.end + %k.026 = phi i32 [ %inc12, %for.end ], [ 0, %entry ] + %sum.025 = phi i32 [ %sum.1, %for.end ], [ 0, %entry ] + %rem23 = and i32 %k.026, 1 + %cmp1 = icmp eq i32 %rem23, 0 + br i1 %cmp1, label %entry8, label %for.cond2 + +for.cond2: ; preds = %for.body, %if.end9 + %sum.1 = phi i32 [ %add10, %if.end9 ], [ %sum.025, %for.body ] + %i.0 = phi i32 [ %inc, %if.end9 ], [ 0, %for.body ] + %cmp3 = icmp slt i32 %i.0, %iter_inner + br i1 %cmp3, label %for.body4, label %for.end +; USE: br i1 %cmp3, label %for.body4, label %for.end, !prof !{{[0-9]+}}, +; USE-SAME: !irr_loop ![[FOR_COND2_IRR_LOOP:[0-9]+]] + +for.body4: ; preds = %for.cond2 + %rem5 = srem i32 %k.026, 3 + %cmp6 = icmp eq i32 %rem5, 0 + br i1 %cmp6, label %entry8, label %if.end9 + +entry8: ; preds = %for.body4, %for.body + %sum.2 = phi i32 [ %sum.025, %for.body ], [ %sum.1, %for.body4 ] + %i.1 = phi i32 [ 0, %for.body ], [ %i.0, %for.body4 ] + %add = add nsw i32 %sum.2, 4 + br label %if.end9 +; USE: br label %if.end9, +; USE-SAME: !irr_loop ![[ENTRY8_IRR_LOOP:[0-9]+]] + +if.end9: ; preds = %entry8, %for.body4 + %sum.3 = phi i32 [ %add, %entry8 ], [ %sum.1, %for.body4 ] + %i.2 = phi i32 [ %i.1, %entry8 ], [ %i.0, %for.body4 ] + %add10 = add nsw 
i32 %sum.3, 1 + %inc = add nsw i32 %i.2, 1 + br label %for.cond2 +; USE: br label %for.cond2, +; USE-SAME: !irr_loop ![[IF_END9_IRR_LOOP:[0-9]+]] + +for.end: ; preds = %for.cond2 + %inc12 = add nuw nsw i32 %k.026, 1 + %exitcond = icmp eq i32 %inc12, %iter_outer + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + + + +@targets = local_unnamed_addr global [256 x i8*] zeroinitializer, align 16 +@tracing = local_unnamed_addr global i32 0, align 4 + +; Function Attrs: noinline norecurse nounwind uwtable +define i32 @_Z11irreduciblePh(i8* nocapture readonly %p) { +entry: + store <2 x i8*> , <2 x i8*>* bitcast ([256 x i8*]* @targets to <2 x i8*>*), align 16 + store i8* blockaddress(@_Z11irreduciblePh, %TARGET_2), i8** getelementptr inbounds ([256 x i8*], [256 x i8*]* @targets, i64 0, i64 2), align 16 + %0 = load i32, i32* @tracing, align 4 + %tobool = icmp eq i32 %0, 0 + br label %for.cond1 + +for.cond1: ; preds = %sw.default, %entry + %p.addr.0 = phi i8* [ %p, %entry ], [ %p.addr.4, %sw.default ] + %sum.0 = phi i32 [ 0, %entry ], [ %add25, %sw.default ] + %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i64 1 + %1 = load i8, i8* %p.addr.0, align 1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %p.addr.0, i64 2 + %2 = load i8, i8* %incdec.ptr, align 1 + %conv3 = zext i8 %2 to i32 + br label %dispatch_op + +dispatch_op: ; preds = %sw.bb6, %for.cond1 + %p.addr.1 = phi i8* [ %incdec.ptr2, %for.cond1 ], [ %p.addr.2, %sw.bb6 ] + %op.0 = phi i8 [ %1, %for.cond1 ], [ 1, %sw.bb6 ] + %oparg.0 = phi i32 [ %conv3, %for.cond1 ], [ %oparg.2, %sw.bb6 ] + %sum.1 = phi i32 [ %sum.0, %for.cond1 ], [ %add7, %sw.bb6 ] + switch i8 %op.0, label %sw.default [ + i8 0, label %sw.bb + i8 1, label %dispatch_op.sw.bb6_crit_edge + i8 2, label %sw.bb15 + ] + +dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op + br label %sw.bb6 + +sw.bb: ; preds = %indirectgoto, %dispatch_op + %oparg.1 = phi i32 [ %oparg.0, %dispatch_op ], [ 0, %indirectgoto ] + %sum.2 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %indirectgoto ] + %add.neg = sub i32 -5, %oparg.1 + %sub = add i32 %add.neg, %sum.2 + br label %exit + +TARGET_1: ; preds = %indirectgoto + %incdec.ptr4 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2 + %3 = load i8, i8* %p.addr.5, align 1 + %conv5 = zext i8 %3 to i32 + br label %sw.bb6 + +sw.bb6: ; preds = %dispatch_op.sw.bb6_crit_edge, %TARGET_1 + %p.addr.2 = phi i8* [ %incdec.ptr4, %TARGET_1 ], [ %p.addr.1, %dispatch_op.sw.bb6_crit_edge ] + %oparg.2 = phi i32 [ %conv5, %TARGET_1 ], [ %oparg.0, %dispatch_op.sw.bb6_crit_edge ] + %sum.3 = phi i32 [ %sum.7, %TARGET_1 ], [ %sum.1, %dispatch_op.sw.bb6_crit_edge ] + %mul = mul nsw i32 %oparg.2, 7 + %add7 = add nsw i32 %sum.3, %mul + %rem46 = and i32 %add7, 1 + %cmp8 = icmp eq i32 %rem46, 0 + br i1 %cmp8, label %dispatch_op, label %if.then +; USE: br i1 %cmp8, label %dispatch_op, label %if.then, !prof !{{[0-9]+}}, +; USE-SAME: !irr_loop ![[SW_BB6_IRR_LOOP:[0-9]+]] + +if.then: ; preds = %sw.bb6 + %mul9 = mul nsw i32 %add7, 9 + br label %indirectgoto + +TARGET_2: ; preds = %indirectgoto + %incdec.ptr13 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2 + %4 = load i8, i8* %p.addr.5, align 1 + %conv14 = zext i8 %4 to i32 + br label %sw.bb15 + +sw.bb15: ; preds = %TARGET_2, %dispatch_op + %p.addr.3 = phi i8* [ %p.addr.1, %dispatch_op ], [ %incdec.ptr13, %TARGET_2 ] + %oparg.3 = phi i32 [ %oparg.0, %dispatch_op ], [ %conv14, %TARGET_2 ] + %sum.4 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %TARGET_2 ] + %add16 = add nsw i32 %oparg.3, 3 + %add17 = add nsw i32 
%add16, %sum.4 + br i1 %tobool, label %if.then18, label %exit +; USE: br i1 %tobool, label %if.then18, label %exit, !prof !{{[0-9]+}}, +; USE-SAME: !irr_loop ![[SW_BB15_IRR_LOOP:[0-9]+]] + +if.then18: ; preds = %sw.bb15 + %idx.ext = sext i32 %oparg.3 to i64 + %add.ptr = getelementptr inbounds i8, i8* %p.addr.3, i64 %idx.ext + %mul19 = mul nsw i32 %add17, 17 + br label %indirectgoto + +unknown_op: ; preds = %indirectgoto + %sub24 = add nsw i32 %sum.7, -4 + br label %sw.default + +sw.default: ; preds = %unknown_op, %dispatch_op + %p.addr.4 = phi i8* [ %p.addr.5, %unknown_op ], [ %p.addr.1, %dispatch_op ] + %sum.5 = phi i32 [ %sub24, %unknown_op ], [ %sum.1, %dispatch_op ] + %add25 = add nsw i32 %sum.5, 11 + br label %for.cond1 + +exit: ; preds = %sw.bb15, %sw.bb + %sum.6 = phi i32 [ %sub, %sw.bb ], [ %add17, %sw.bb15 ] + ret i32 %sum.6 + +indirectgoto: ; preds = %if.then18, %if.then + %add.ptr.pn = phi i8* [ %add.ptr, %if.then18 ], [ %p.addr.2, %if.then ] + %sum.7 = phi i32 [ %mul19, %if.then18 ], [ %mul9, %if.then ] + %p.addr.5 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 1 + %5 = load i8, i8* %add.ptr.pn, align 1 + %idxprom21 = zext i8 %5 to i64 + %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21 + %6 = load i8*, i8** %arrayidx22, align 8 + indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2] +; USE: indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !{{[0-9]+}}, +; USE-SAME: !irr_loop ![[INDIRECTGOTO_IRR_LOOP:[0-9]+]] +} + +; USE: ![[FOR_COND2_IRR_LOOP]] = !{!"loop_header_weight", i64 1050} +; USE: ![[ENTRY8_IRR_LOOP]] = !{!"loop_header_weight", i64 373} +; USE: ![[IF_END9_IRR_LOOP]] = !{!"loop_header_weight", i64 1000} +; USE: ![[SW_BB6_IRR_LOOP]] = !{!"loop_header_weight", i64 501} +; USE: ![[SW_BB15_IRR_LOOP]] = !{!"loop_header_weight", i64 100} +; USE: ![[INDIRECTGOTO_IRR_LOOP]] = !{!"loop_header_weight", i64 400} -- cgit v1.2.3 From f79fab6f98f7bbecb85bd1e7bad70088f9f76e6b Mon Sep 17 00:00:00 2001 From: Konstantin Zhuravlyov Date: Thu, 2 Nov 2017 22:35:22 +0000 Subject: AMDGPU: Fix warning discovered by r317266 [-Wunused-private-field] git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317280 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIMachineFunctionInfo.h | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 5f5636e119a..fed31fbf42b 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -140,7 +140,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { private: unsigned LDSWaveSpillSize = 0; - unsigned ScratchOffsetReg; unsigned NumUserSGPRs = 0; unsigned NumSystemSGPRs = 0; -- cgit v1.2.3 From 56898c124500c5871061bcdae65eb2033743438b Mon Sep 17 00:00:00 2001 From: Jake Ehrlich Date: Thu, 2 Nov 2017 23:14:55 +0000 Subject: Add feature to determine if host architecture is 64-bit in llvm-lit I have a test that I'd like to add to llvm that demands using more than 32 bits' worth of address space. This test can't be run on 32-bit systems because they don't have enough address space. The host triple should be used to determine this instead of config.host_arch because on Debian systems config.host_arch is not correct. This change adds the "llvm-64-bits" feature to allow tests to restrict themselves to the 64-bit case.
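As a sketch, a test that needs the larger address space could then guard itself on the new feature; the RUN line below is a placeholder, only the REQUIRES line comes from this change:

  ; REQUIRES: llvm-64-bits
  ; RUN: llc < %s -o /dev/null

On a 32-bit host the feature is absent, so lit reports such a test as unsupported instead of running it.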
Differential Revision: https://reviews.llvm.org/D39465 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317281 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/lit.cfg.py | 3 +++ test/lit.site.cfg.py.in | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/test/lit.cfg.py b/test/lit.cfg.py index 6a5cf69b987..57dc1f07049 100644 --- a/test/lit.cfg.py +++ b/test/lit.cfg.py @@ -168,6 +168,9 @@ for arch in config.targets_to_build.split(): config.available_features.add(arch.lower() + '-registered-target') # Features +known_arches = ["x86_64", "mips64", "ppc64", "aarch64"] +if any(config.llvm_host_triple.startswith(x) for x in known_arches): + config.available_features.add("llvm-64-bits") # Others/can-execute.txt if sys.platform not in ['win32']: diff --git a/test/lit.site.cfg.py.in b/test/lit.site.cfg.py.in index 19e5cd0d3c2..efdd016e45d 100644 --- a/test/lit.site.cfg.py.in +++ b/test/lit.site.cfg.py.in @@ -29,7 +29,6 @@ config.targets_to_build = "@TARGETS_TO_BUILD@" config.native_target = "@LLVM_NATIVE_ARCH@" config.llvm_bindings = "@LLVM_BINDINGS@".split(' ') config.host_os = "@HOST_OS@" -config.host_arch = "@HOST_ARCH@" config.host_cc = "@HOST_CC@" config.host_cxx = "@HOST_CXX@" config.host_ldflags = "@HOST_LDFLAGS@" @@ -42,6 +41,7 @@ config.enable_ffi = @LLVM_ENABLE_FFI@ config.build_shared_libs = @BUILD_SHARED_LIBS@ config.link_llvm_dylib = @LLVM_LINK_LLVM_DYLIB@ config.llvm_libxml2_enabled = "@LLVM_LIBXML2_ENABLED@" +config.llvm_host_triple = '@LLVM_HOST_TRIPLE@' # Support substitution of the tools_dir with user parameters. This is # used when we can't determine the tool dir at configuration time. -- cgit v1.2.3 From a555cf06835827701a43bb0528d74bfc195fdeb8 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 2 Nov 2017 23:17:06 +0000 Subject: IndVarSimplify: preserve debug information attached to widened PHI nodes. This fixes PR35015. https://bugs.llvm.org/show_bug.cgi?id=35015 Differential Revision: https://reviews.llvm.org/D39345 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317282 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/IndVarSimplify.cpp | 10 +++ .../IndVarSimplify/scev-phi-debug-info.ll | 71 ++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 test/Transforms/IndVarSimplify/scev-phi-debug-info.ll diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 9ce42a06825..abb50f27f1c 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -48,6 +48,7 @@ #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -1624,6 +1625,15 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { if (DU.NarrowDef->use_empty()) DeadInsts.emplace_back(DU.NarrowDef); } + + // Attach any debug information to the new PHI. Since OrigPhi and WidePHI + // evaluate the same recurrence, we can just copy the debug info over. 
+ SmallVector DbgValues; + llvm::findDbgValues(DbgValues, OrigPhi); + auto *MDPhi = MetadataAsValue::get(WidePhi->getContext(), + ValueAsMetadata::get(WidePhi)); + for (auto &DbgValue : DbgValues) + DbgValue->setOperand(0, MDPhi); return WidePhi; } diff --git a/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll b/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll new file mode 100644 index 00000000000..dc6aae8d8aa --- /dev/null +++ b/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll @@ -0,0 +1,71 @@ +; RUN: opt %s -indvars -S -o - | FileCheck %s +source_filename = "/Data/llvm/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.status = type { i32, i8* } + +@status = internal unnamed_addr global [32 x %struct.status] zeroinitializer, align 16, !dbg !0 + +define void @f0() local_unnamed_addr !dbg !20 { +entry: + tail call void @llvm.dbg.value(metadata i32 0, metadata !23, metadata !DIExpression()), !dbg !24 + br label %for.cond, !dbg !24 + +for.cond: ; preds = %for.body, %entry + ; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + ; CHECK: call void @llvm.dbg.value(metadata i64 %indvars.iv, metadata !23, metadata !DIExpression()), !dbg !24 + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + tail call void @llvm.dbg.value(metadata i32 %i.0, metadata !23, metadata !DIExpression()), !dbg !24 + %cmp = icmp slt i32 %i.0, 32, !dbg !24 + br i1 %cmp, label %for.body, label %for.end, !dbg !24 + +for.body: ; preds = %for.cond + %idxprom = sext i32 %i.0 to i64, !dbg !24 + %value = getelementptr inbounds [32 x %struct.status], [32 x %struct.status]* @status, i64 0, i64 %idxprom, i32 0, !dbg !24 + store i32 42, i32* %value, align 16, !dbg !24 + tail call void @use(i32 %i.0), !dbg !24 + %inc = add nsw i32 %i.0, 1, !dbg !24 + tail call void @llvm.dbg.value(metadata i32 %inc, metadata !23, metadata !DIExpression()), !dbg !24 + br label %for.cond, !dbg !24 + +for.end: ; preds = %for.cond + ret void, !dbg !24 +} + +declare void @use(i32) + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) #0 + +attributes #0 = { nounwind readnone speculatable } + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!16, !17, !18} +!llvm.ident = !{!19} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "status", scope: !2, file: !3, line: 5, type: !6, isLocal: true, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 6.0.0 (trunk 316001) (llvm/trunk 316171)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5) +!3 = !DIFile(filename: "x.c", directory: "/home/davide/work/llvm/build-release/bin") +!4 = !{} +!5 = !{!0} +!6 = !DICompositeType(tag: DW_TAG_array_type, baseType: !7, size: 4096, elements: !14) +!7 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "status", file: !3, line: 2, size: 128, elements: !8) +!8 = !{!9, !11} +!9 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !7, file: !3, line: 3, baseType: !10, size: 32) +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!11 = !DIDerivedType(tag: DW_TAG_member, name: "p", scope: !7, file: !3, line: 4, baseType: !12, size: 64, offset: 64) +!12 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !13, size: 64) +!13 = !DIBasicType(name: "unsigned char", size: 8, encoding: 
DW_ATE_unsigned_char) +!14 = !{!15} +!15 = !DISubrange(count: 32) +!16 = !{i32 2, !"Dwarf Version", i32 4} +!17 = !{i32 2, !"Debug Info Version", i32 3} +!18 = !{i32 1, !"wchar_size", i32 4} +!19 = !{!"clang version 6.0.0 (trunk 316001) (llvm/trunk 316171)"} +!20 = distinct !DISubprogram(name: "f0", scope: !3, file: !3, line: 6, type: !21, isLocal: false, isDefinition: true, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: true, unit: !2, variables: !22) +!21 = !DISubroutineType(types: !4) +!22 = !{!23} +!23 = !DILocalVariable(name: "i", scope: !20, file: !3, line: 8, type: !10) +!24 = !DILocation(line: 9, scope: !20) -- cgit v1.2.3 From 6d06c893037035a00e081b6740d977dcce8653f5 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 2 Nov 2017 23:23:37 +0000 Subject: [X86] Give AVX512VL instructions priority over their AVX equivalents. I thought we had gotten all these priority bugs worked out, but I guess not. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317283 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrSSE.td | 4 ++-- test/CodeGen/X86/avx-intrinsics-x86.ll | 26 ++++++++++++++++++-------- test/CodeGen/X86/sse-intrinsics-x86.ll | 13 +++++++++---- test/CodeGen/X86/sse2-intrinsics-x86.ll | 13 +++++++++---- 4 files changed, 38 insertions(+), 18 deletions(-) diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 451303054f5..d4676b57455 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -3186,7 +3186,7 @@ let Predicates = prds in { /// sse2_fp_unop_p - SSE2 unops in vector forms. multiclass sse2_fp_unop_p opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def V#NAME#PDr : PDI opc, string OpcodeStr, SDNode OpNode, // Square root. 
defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, - sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX, NoVLX]>, sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>, sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll index 44eb14160ee..b0cf4e3b29f 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -635,10 +635,15 @@ declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) { -; CHECK-LABEL: test_x86_avx_sqrt_pd_256: -; CHECK: # BB#0: -; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX-LABEL: test_x86_avx_sqrt_pd_256: +; AVX: # BB#0: +; AVX-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0] +; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vsqrtpd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x51,0xc0] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -646,10 +651,15 @@ declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) { -; CHECK-LABEL: test_x86_avx_sqrt_ps_256: -; CHECK: # BB#0: -; CHECK-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX-LABEL: test_x86_avx_sqrt_ps_256: +; AVX: # BB#0: +; AVX-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0] +; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vsqrtps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x51,0xc0] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1] ret <8 x float> %res } diff --git a/test/CodeGen/X86/sse-intrinsics-x86.ll b/test/CodeGen/X86/sse-intrinsics-x86.ll index f178e18a259..5ba9f9a2645 100644 --- a/test/CodeGen/X86/sse-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse-intrinsics-x86.ll @@ -475,10 +475,15 @@ define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) { ; SSE-NEXT: sqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x51,0xc0] ; SSE-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_sqrt_ps: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse_sqrt_ps: +; AVX2: ## BB#0: +; AVX2-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse_sqrt_ps: +; SKX: ## BB#0: +; SKX-NEXT: vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0] +; SKX-NEXT: retl ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll index d4047faad9b..dcccdbfc2e6 100644 --- a/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -1592,10 +1592,15 @@ define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) { ; 
SSE-NEXT: sqrtpd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x51,0xc0] ; SSE-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse2_sqrt_pd: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse2_sqrt_pd: +; AVX2: ## BB#0: +; AVX2-NEXT: vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_sqrt_pd: +; SKX: ## BB#0: +; SKX-NEXT: vsqrtpd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x51,0xc0] +; SKX-NEXT: retl ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1] ret <2 x double> %res } -- cgit v1.2.3 From 89fd072604faffaaf81741519ecb99ecaaa3a55e Mon Sep 17 00:00:00 2001 From: Jake Ehrlich Date: Thu, 2 Nov 2017 23:24:04 +0000 Subject: [llvm-objcopy] Fix bug in how segment alignment was being handled Just aligning segment offsets to segment alignment is incorrect and also wastes more space than is needed. The requirement is that p_offset == p_addr modulo p_align *not* that p_offset == 0 modulo p_align. Generally speaking we've been using p_addr == 0 modulo p_align. In fact yaml2obj can't even produce a valid situation which causes llvm-objcopy to produce incorrect results because alignment and offset were both inherited from the sections the program header covers. This change fixes this bad behavior in llvm-objcopy. Differential Revision: https://reviews.llvm.org/D39132 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317284 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../check-addr-offset-align-binary.test | 40 +++++++++++++ .../llvm-objcopy/check-addr-offset-align.test | 67 ++++++++++++++++++++++ tools/llvm-objcopy/Object.cpp | 20 ++++++- 3 files changed, 124 insertions(+), 3 deletions(-) create mode 100644 test/tools/llvm-objcopy/check-addr-offset-align-binary.test create mode 100644 test/tools/llvm-objcopy/check-addr-offset-align.test diff --git a/test/tools/llvm-objcopy/check-addr-offset-align-binary.test b/test/tools/llvm-objcopy/check-addr-offset-align-binary.test new file mode 100644 index 00000000000..755acceeda2 --- /dev/null +++ b/test/tools/llvm-objcopy/check-addr-offset-align-binary.test @@ -0,0 +1,40 @@ +# RUN: yaml2obj %s -o %t +# RUN: llvm-objcopy -O binary %t %t2 +# RUN: od -t x1 %t2 | FileCheck %s + +!ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1000 + AddressAlign: 0x0000000000001000 + Content: "c3c3c3c3" + - Name: .data + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x1008 + AddressAlign: 0x0000000000000008 + Content: "3232" +ProgramHeaders: + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + VAddr: 0x1000 + PAddr: 0x1000 + Align: 0x1000 + Sections: + - Section: .text + - Type: PT_LOAD + Flags: [ PF_R, PF_W ] + VAddr: 0x1008 + PAddr: 0x1008 + Align: 0x1000 + Sections: + - Section: .data + +# CHECK: 0000000 c3 c3 c3 c3 00 00 00 00 32 32 diff --git a/test/tools/llvm-objcopy/check-addr-offset-align.test b/test/tools/llvm-objcopy/check-addr-offset-align.test new file mode 100644 index 00000000000..ca2367ba434 --- /dev/null +++ b/test/tools/llvm-objcopy/check-addr-offset-align.test @@ -0,0 +1,67 @@ +# RUN: yaml2obj %s -o %t +# RUN: llvm-objcopy %t %t2 +# RUN: llvm-readobj -program-headers %t2 | FileCheck %s + +!ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: 
ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1000 + AddressAlign: 0x0000000000001000 + Content: "c3c3c3c3" + - Name: .data + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x1008 + AddressAlign: 0x0000000000000008 + Content: "3232" +ProgramHeaders: + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + VAddr: 0x1000 + PAddr: 0x1000 + Align: 0x1000 + Sections: + - Section: .text + - Type: PT_LOAD + Flags: [ PF_R, PF_W ] + VAddr: 0x1008 + PAddr: 0x1008 + Align: 0x1000 + Sections: + - Section: .data + +#CHECK: ProgramHeaders [ +#CHECK-NEXT: ProgramHeader { +#CHECK-NEXT: Type: PT_LOAD +#CHECK-NEXT: Offset: 0x1000 +#CHECK-NEXT: VirtualAddress: 0x1000 +#CHECK-NEXT: PhysicalAddress: 0x1000 +#CHECK-NEXT: FileSize: 4 +#CHECK-NEXT: MemSize: 4 +#CHECK-NEXT: Flags [ +#CHECK-NEXT: PF_R +#CHECK-NEXT: PF_X +#CHECK-NEXT: ] +#CHECK-NEXT: Alignment: 4096 +#CHECK-NEXT: } +#CHECK-NEXT: ProgramHeader { +#CHECK-NEXT: Type: PT_LOAD +#CHECK-NEXT: Offset: 0x1008 +#CHECK-NEXT: VirtualAddress: 0x1008 +#CHECK-NEXT: PhysicalAddress: 0x1008 +#CHECK-NEXT: FileSize: 2 +#CHECK-NEXT: MemSize: 2 +#CHECK-NEXT: Flags [ +#CHECK-NEXT: PF_R +#CHECK-NEXT: PF_W +#CHECK-NEXT: ] +#CHECK-NEXT: Alignment: 4096 +#CHECK-NEXT: } +#CHECK-NEXT:] diff --git a/tools/llvm-objcopy/Object.cpp b/tools/llvm-objcopy/Object.cpp index 22ae47f1cac..5f9864d9cc0 100644 --- a/tools/llvm-objcopy/Object.cpp +++ b/tools/llvm-objcopy/Object.cpp @@ -685,6 +685,19 @@ template void ELFObject::sortSections() { CompareSections); } +static uint64_t alignToAddr(uint64_t Offset, uint64_t Addr, uint64_t Align) { + // Calculate Diff such that (Offset + Diff) & -Align == Addr & -Align. + if (Align == 0) + Align = 1; + auto Diff = + static_cast(Addr % Align) - static_cast(Offset % Align); + // We only want to add to Offset, however, so if Diff < 0 we can add Align and + // (Offset + Diff) & -Align == Addr & -Align will still hold. + if (Diff < 0) + Diff += Align; + return Offset + Diff; +} + template void ELFObject::assignOffsets() { // We need a temporary list of segments that has a special order to it // so that we know that anytime ->ParentSegment is set that segment has @@ -728,7 +741,7 @@ template void ELFObject::assignOffsets() { Segment->Offset = Parent->Offset + Segment->OriginalOffset - Parent->OriginalOffset; } else { - Offset = alignTo(Offset, Segment->Align == 0 ? 1 : Segment->Align); + Offset = alignToAddr(Offset, Segment->VAddr, Segment->Align); Segment->Offset = Offset; } Offset = std::max(Offset, Segment->Offset + Segment->FileSize); @@ -829,8 +842,9 @@ template void BinaryObject::finalize() { uint64_t Offset = 0; for (auto &Segment : this->Segments) { - if (Segment->Type == PT_LOAD && Segment->firstSection() != nullptr) { - Offset = alignTo(Offset, Segment->Align); + if (Segment->Type == llvm::ELF::PT_LOAD && + Segment->firstSection() != nullptr) { + Offset = alignToAddr(Offset, Segment->VAddr, Segment->Align); Segment->Offset = Offset; Offset += Segment->FileSize; } -- cgit v1.2.3 From 0ae3f32f5642942bbc7ebd2f40e1b218eee51fef Mon Sep 17 00:00:00 2001 From: Puyan Lotfi Date: Thu, 2 Nov 2017 23:37:32 +0000 Subject: mir-canon: First commit. mir-canon (MIRCanonicalizerPass) is a pass designed to reorder instructions and rename operands so that two similar programs will diff more cleanly after being run through mir-canon than they would otherwise. This project is still a work in progress and there are ideas still being discussed for improving diff quality. 
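The congruence rule at the heart of the llvm-objcopy change above can be exercised in isolation. Below is a minimal, self-contained sketch of the same computation as the patch's alignToAddr (standalone C++; the test values are illustrative and not taken from the commit):

    #include <cassert>
    #include <cstdint>

    // Smallest Offset' >= Offset with Offset' % Align == Addr % Align.
    static uint64_t alignToAddr(uint64_t Offset, uint64_t Addr, uint64_t Align) {
      if (Align == 0)
        Align = 1;
      int64_t Diff = static_cast<int64_t>(Addr % Align) -
                     static_cast<int64_t>(Offset % Align);
      if (Diff < 0)
        Diff += Align; // only ever move the offset forward
      return Offset + Diff;
    }

    int main() {
      // A segment with vaddr 0x1008 and alignment 0x1000 placed after 0x200
      // bytes of file content lands at offset 0x1008, not at the next 0x1000
      // boundary, because 0x1008 % 0x1000 == 0x1008 % 0x1000.
      assert(alignToAddr(0x200, 0x1008, 0x1000) == 0x1008);
      // An already-congruent offset is left alone.
      assert(alignToAddr(0x1008, 0x2008, 0x1000) == 0x1008);
      return 0;
    }

This is why the new check-addr-offset-align tests place .data at vaddr 0x1008 inside a 0x1000-aligned PT_LOAD and expect Offset: 0x1008 in the llvm-readobj output.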
M include/llvm/InitializePasses.h M lib/CodeGen/CMakeLists.txt M lib/CodeGen/CodeGen.cpp A lib/CodeGen/MIRCanonicalizerPass.cpp git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317285 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/InitializePasses.h | 1 + lib/CodeGen/CMakeLists.txt | 1 + lib/CodeGen/CodeGen.cpp | 1 + lib/CodeGen/MIRCanonicalizerPass.cpp | 626 +++++++++++++++++++++++++++++++++++ 4 files changed, 629 insertions(+) create mode 100644 lib/CodeGen/MIRCanonicalizerPass.cpp diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index c3ad8fe41af..8c63ab0284d 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -377,6 +377,7 @@ void initializeWinEHPreparePass(PassRegistry&); void initializeWriteBitcodePassPass(PassRegistry&); void initializeWriteThinLTOBitcodePass(PassRegistry&); void initializeXRayInstrumentationPass(PassRegistry&); +void initializeMIRCanonicalizerPass(PassRegistry &); } // end namespace llvm diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 7ec7fda4e44..2e364cd4794 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -113,6 +113,7 @@ add_llvm_library(LLVMCodeGen RegisterPressure.cpp RegisterScavenging.cpp RenameIndependentSubregs.cpp + MIRCanonicalizerPass.cpp RegisterUsageInfo.cpp RegUsageInfoCollector.cpp RegUsageInfoPropagate.cpp diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index f4ccb4889d3..bfab865687e 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -99,6 +99,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeVirtRegRewriterPass(Registry); initializeWinEHPreparePass(Registry); initializeXRayInstrumentationPass(Registry); + initializeMIRCanonicalizerPass(Registry); } void LLVMInitializeCodeGen(LLVMPassRegistryRef R) { diff --git a/lib/CodeGen/MIRCanonicalizerPass.cpp b/lib/CodeGen/MIRCanonicalizerPass.cpp new file mode 100644 index 00000000000..61f9f7e2c5d --- /dev/null +++ b/lib/CodeGen/MIRCanonicalizerPass.cpp @@ -0,0 +1,626 @@ +//===-------------- MIRCanonicalizer.cpp - MIR Canonicalizer --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The purpose of this pass is to employ a canonical code transformation so +// that code compiled with slightly different IR passes can be diffed more +// effectively than otherwise. This is done by renaming vregs in a given +// LiveRange in a canonical way. This pass also does a pseudo-scheduling to +// move defs closer to their use inorder to reduce diffs caused by slightly +// different schedules. +// +// Basic Usage: +// +// llc -o - -run-pass mir-canonicalizer example.mir +// +// Reorders instructions canonically. +// Renames virtual register operands canonically. +// Strips certain MIR artifacts (optionally). 
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#include <queue>
+
+using namespace llvm;
+
+namespace llvm {
+extern char &MIRCanonicalizerID;
+} // namespace llvm
+
+#define DEBUG_TYPE "mir-canonicalizer"
+
+static cl::opt<unsigned>
+CanonicalizeFunctionNumber("canon-nth-function", cl::Hidden, cl::init(~0u),
+                           cl::value_desc("N"),
+                           cl::desc("Function number to canonicalize."));
+
+static cl::opt<unsigned>
+CanonicalizeBasicBlockNumber("canon-nth-basicblock", cl::Hidden, cl::init(~0u),
+                             cl::value_desc("N"),
+                             cl::desc("BasicBlock number to canonicalize."));
+
+namespace {
+
+class MIRCanonicalizer : public MachineFunctionPass {
+public:
+  static char ID;
+  MIRCanonicalizer() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "Rename register operands in a canonical ordering.";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // end anonymous namespace
+
+enum VRType { RSE_Reg = 0, RSE_FrameIndex, RSE_NewCandidate };
+class TypedVReg {
+  VRType type;
+  unsigned reg;
+
+public:
+  TypedVReg(unsigned reg) : type(RSE_Reg), reg(reg) {}
+  TypedVReg(VRType type) : type(type), reg(~0U) {
+    assert(type != RSE_Reg && "Expected a non-register type.");
+  }
+
+  bool isReg() const { return type == RSE_Reg; }
+  bool isFrameIndex() const { return type == RSE_FrameIndex; }
+  bool isCandidate() const { return type == RSE_NewCandidate; }
+
+  VRType getType() const { return type; }
+  unsigned getReg() const {
+    assert(this->isReg() && "Expected a virtual or physical register.");
+    return reg;
+  }
+};
+
+char MIRCanonicalizer::ID;
+
+char &llvm::MIRCanonicalizerID = MIRCanonicalizer::ID;
+
+INITIALIZE_PASS_BEGIN(MIRCanonicalizer, "mir-canonicalizer",
+                      "Rename Register Operands Canonically", false, false);
+
+INITIALIZE_PASS_END(MIRCanonicalizer, "mir-canonicalizer",
+                    "Rename Register Operands Canonically", false, false);
+
+static std::vector<MachineBasicBlock *> GetRPOList(MachineFunction &MF) {
+  ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
+  std::vector<MachineBasicBlock *> RPOList;
+  for (auto MBB : RPOT) {
+    RPOList.push_back(MBB);
+  }
+
+  return RPOList;
+}
+
+// Find a dummy vreg. We use this vreg's register class to generate throw-away
+// vregs that are used to skip vreg numbers so that vreg numbers line up.
+static unsigned GetDummyVReg(const MachineFunction &MF) {
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      for (auto &MO : MI.operands()) {
+        if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+          continue;
+        return MO.getReg();
+      }
+    }
+  }
+
+  return ~0U;
+}
+
+static bool rescheduleCanonically(MachineBasicBlock *MBB) {
+
+  bool Changed = false;
+
+  // Calculates the distance of MI from the beginning of its parent BB.
+  auto getInstrIdx = [](const MachineInstr &MI) {
+    unsigned i = 0;
+    for (auto &CurMI : *MI.getParent()) {
+      if (&CurMI == &MI)
+        return i;
+      i++;
+    }
+    return ~0U;
+  };
+
+  // Pre-populate a vector of instructions to reschedule so that we don't
+  // clobber the iterator.
+  std::vector<MachineInstr *> Instructions;
+  for (auto &MI : *MBB) {
+    Instructions.push_back(&MI);
+  }
+
+  for (auto *II : Instructions) {
+    if (II->getNumOperands() == 0)
+      continue;
+
+    MachineOperand &MO = II->getOperand(0);
+    if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      continue;
+
+    DEBUG(dbgs() << "Operand " << 0 << " of "; II->dump(); MO.dump(););
+
+    MachineInstr *Def = II;
+    unsigned Distance = ~0U;
+    MachineInstr *UseToBringDefCloserTo = nullptr;
+    MachineRegisterInfo *MRI = &MBB->getParent()->getRegInfo();
+    for (auto &UO : MRI->use_nodbg_operands(MO.getReg())) {
+      MachineInstr *UseInst = UO.getParent();
+
+      const unsigned DefLoc = getInstrIdx(*Def);
+      const unsigned UseLoc = getInstrIdx(*UseInst);
+      const unsigned Delta = (UseLoc - DefLoc);
+
+      if (UseInst->getParent() != Def->getParent())
+        continue;
+      if (DefLoc >= UseLoc)
+        continue;
+
+      if (Delta < Distance) {
+        Distance = Delta;
+        UseToBringDefCloserTo = UseInst;
+      }
+    }
+
+    const auto BBE = MBB->instr_end();
+    MachineBasicBlock::iterator DefI = BBE;
+    MachineBasicBlock::iterator UseI = BBE;
+
+    for (auto BBI = MBB->instr_begin(); BBI != BBE; ++BBI) {
+
+      if (DefI != BBE && UseI != BBE)
+        break;
+
+      if ((&*BBI != Def) && (&*BBI != UseToBringDefCloserTo))
+        continue;
+
+      if (&*BBI == Def) {
+        DefI = BBI;
+        continue;
+      }
+
+      if (&*BBI == UseToBringDefCloserTo) {
+        UseI = BBI;
+        continue;
+      }
+    }
+
+    if (DefI == BBE || UseI == BBE)
+      continue;
+
+    DEBUG({
+      dbgs() << "Splicing ";
+      DefI->dump();
+      dbgs() << " right before: ";
+      UseI->dump();
+    });
+
+    Changed = true;
+    MBB->splice(UseI, MBB, DefI);
+  }
+
+  return Changed;
+}
+
+/// Here we find our candidates. What makes an interesting candidate?
+/// A candidate for a canonicalization tree root is normally any kind of
+/// instruction that causes side effects, such as a store to memory, a copy to
+/// a physical register, or a return instruction. We use each of these as an
+/// expression tree root that we walk in order to build a canonical walk, which
+/// should result in canonical vreg renaming.
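+/// For example, in a hypothetical block such as
+///   %v0 = LOAD <fi#0>
+///   %v1 = ADD %v0, %v0
+///   STORE %v1, <addr>
+/// only the STORE is taken as a candidate root; %v1 and then %v0 are reached
+/// from it during the walk. (Illustrative pseudo-MIR, not taken from a test.)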
+static std::vector<MachineInstr *> populateCandidates(MachineBasicBlock *MBB) {
+  std::vector<MachineInstr *> Candidates;
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+  for (auto II = MBB->begin(), IE = MBB->end(); II != IE; ++II) {
+    MachineInstr *MI = &*II;
+
+    bool DoesMISideEffect = false;
+
+    if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg()) {
+      const unsigned Dst = MI->getOperand(0).getReg();
+      DoesMISideEffect |= !TargetRegisterInfo::isVirtualRegister(Dst);
+
+      for (auto UI = MRI.use_begin(Dst); UI != MRI.use_end(); ++UI) {
+        if (DoesMISideEffect) break;
+        DoesMISideEffect |= (UI->getParent()->getParent() != MI->getParent());
+      }
+    }
+
+    if (!MI->mayStore() && !MI->isBranch() && !DoesMISideEffect)
+      continue;
+
+    DEBUG(dbgs() << "Found Candidate: "; MI->dump(););
+    Candidates.push_back(MI);
+  }
+
+  return Candidates;
+}
+
+void doCandidateWalk(std::vector<TypedVReg> &VRegs,
+                     std::queue<TypedVReg> &RegQueue,
+                     std::vector<MachineInstr *> &VisitedMIs,
+                     const MachineBasicBlock *MBB) {
+
+  const MachineFunction &MF = *MBB->getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  while (!RegQueue.empty()) {
+
+    auto TReg = RegQueue.front();
+    RegQueue.pop();
+
+    if (TReg.isFrameIndex()) {
+      DEBUG(dbgs() << "Popping frame index.\n";);
+      VRegs.push_back(TypedVReg(RSE_FrameIndex));
+      continue;
+    }
+
+    assert(TReg.isReg() && "Expected vreg or physreg.");
+    unsigned Reg = TReg.getReg();
+
+    if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+      DEBUG({
+        dbgs() << "Popping vreg ";
+        MRI.def_begin(Reg)->dump();
+        dbgs() << "\n";
+      });
+
+      if (!llvm::any_of(VRegs, [&](const TypedVReg &TR) {
+            return TR.isReg() && TR.getReg() == Reg;
+          })) {
+        VRegs.push_back(TypedVReg(Reg));
+      }
+    } else {
+      DEBUG(dbgs() << "Popping physreg.\n";);
+      VRegs.push_back(TypedVReg(Reg));
+      continue;
+    }
+
+    for (auto RI = MRI.def_begin(Reg), RE = MRI.def_end(); RI != RE; ++RI) {
+      MachineInstr *Def = RI->getParent();
+
+      if (Def->getParent() != MBB)
+        continue;
+
+      if (llvm::any_of(VisitedMIs,
+                       [&](const MachineInstr *VMI) { return Def == VMI; })) {
+        break;
+      }
+
+      DEBUG({
+        dbgs() << "\n========================\n";
+        dbgs() << "Visited MI: ";
+        Def->dump();
+        dbgs() << "BB Name: " << Def->getParent()->getName() << "\n";
+        dbgs() << "\n========================\n";
+      });
+      VisitedMIs.push_back(Def);
+      for (unsigned I = 1, E = Def->getNumOperands(); I != E; ++I) {
+
+        MachineOperand &MO = Def->getOperand(I);
+        if (MO.isFI()) {
+          DEBUG(dbgs() << "Pushing frame index.\n";);
+          RegQueue.push(TypedVReg(RSE_FrameIndex));
+        }
+
+        if (!MO.isReg())
+          continue;
+        RegQueue.push(TypedVReg(MO.getReg()));
+      }
+    }
+  }
+}
+
+// TODO: Work to remove this in the future. One day when we have named vregs
+// we should be able to form the canonical name based on some characteristic
+// we see at that point of the expression tree (like if we were to name based
+// on some sort of value numbering scheme).
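+// (Current behavior: with VR_GAP = N * 1000 for the Nth visited block, the
+// first block's renames start in the %1000 range, the second block's in the
+// %2000 range, and so on, so the corresponding blocks of two modules being
+// compared number their vregs from the same base.)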
+static void SkipVRegs(unsigned &VRegGapIndex, MachineRegisterInfo &MRI,
+                      const TargetRegisterClass *RC) {
+  const unsigned VR_GAP = (++VRegGapIndex * 1000);
+
+  DEBUG({
+    dbgs() << "Adjusting per-BB VR_GAP for BB" << VRegGapIndex << " to "
+           << VR_GAP << "\n";
+  });
+
+  unsigned I = MRI.createVirtualRegister(RC);
+  const unsigned E = (((I + VR_GAP) / VR_GAP) + 1) * VR_GAP;
+  while (I != E) {
+    I = MRI.createVirtualRegister(RC);
+  }
+}
+
+static std::map<unsigned, unsigned>
+GetVRegRenameMap(const std::vector<TypedVReg> &VRegs,
+                 const std::vector<unsigned> &renamedInOtherBB,
+                 MachineRegisterInfo &MRI,
+                 const TargetRegisterClass *RC) {
+  std::map<unsigned, unsigned> VRegRenameMap;
+  unsigned LastRenameReg = MRI.createVirtualRegister(RC);
+  bool FirstCandidate = true;
+
+  for (auto &vreg : VRegs) {
+    if (vreg.isFrameIndex()) {
+      // We skip one vreg for any frame index because there is a good chance
+      // (especially when comparing SelectionDAG to GlobalISel generated MIR)
+      // that in the other file we are just getting an incoming vreg that comes
+      // from a copy from a frame index. So it's safe to skip by one.
+      LastRenameReg = MRI.createVirtualRegister(RC);
+      DEBUG(dbgs() << "Skipping rename for FI " << LastRenameReg << "\n";);
+      continue;
+    } else if (vreg.isCandidate()) {
+
+      // For each candidate we skip to the next vreg number that is a multiple
+      // of 10 so that the candidates are more likely to start at the same vreg
+      // number, making it more likely that the canonical walks from matching
+      // candidate instructions line up. We don't need to skip from the first
+      // candidate of the BasicBlock because we already skip ahead several
+      // vregs for each BB.
+      while (LastRenameReg % 10) {
+        if (!FirstCandidate) break;
+        LastRenameReg = MRI.createVirtualRegister(RC);
+
+        DEBUG({
+          dbgs() << "Skipping rename for new candidate " << LastRenameReg
+                 << "\n";
+        });
+      }
+      FirstCandidate = false;
+      continue;
+    } else if (!TargetRegisterInfo::isVirtualRegister(vreg.getReg())) {
+      LastRenameReg = MRI.createVirtualRegister(RC);
+      DEBUG({
+        dbgs() << "Skipping rename for Phys Reg " << LastRenameReg << "\n";
+      });
+      continue;
+    }
+
+    auto Reg = vreg.getReg();
+    if (llvm::find(renamedInOtherBB, Reg) != renamedInOtherBB.end()) {
+      DEBUG(dbgs() << "Vreg " << Reg << " already renamed in other BB.\n";);
+      continue;
+    }
+
+    auto Rename = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+    LastRenameReg = Rename;
+
+    if (VRegRenameMap.find(Reg) == VRegRenameMap.end()) {
+      DEBUG(dbgs() << "Mapping vreg ";);
+      if (MRI.reg_begin(Reg) != MRI.reg_end()) {
+        DEBUG(auto foo = &*MRI.reg_begin(Reg); foo->dump(););
+      } else {
+        DEBUG(dbgs() << Reg;);
+      }
+      DEBUG(dbgs() << " to ";);
+      if (MRI.reg_begin(Rename) != MRI.reg_end()) {
+        DEBUG(auto foo = &*MRI.reg_begin(Rename); foo->dump(););
+      } else {
+        DEBUG(dbgs() << Rename;);
+      }
+      DEBUG(dbgs() << "\n";);
+
+      VRegRenameMap.insert(std::pair<unsigned, unsigned>(Reg, Rename));
+    }
+  }
+
+  return VRegRenameMap;
+}
+
+static bool doVRegRenaming(std::vector<unsigned> &RenamedInOtherBB,
+                           const std::map<unsigned, unsigned> &VRegRenameMap,
+                           MachineRegisterInfo &MRI) {
+  bool Changed = false;
+  for (auto I = VRegRenameMap.begin(), E = VRegRenameMap.end(); I != E; ++I) {
+
+    auto VReg = I->first;
+    auto Rename = I->second;
+
+    RenamedInOtherBB.push_back(Rename);
+
+    std::vector<MachineOperand *> RenameMOs;
+    for (auto &MO : MRI.reg_operands(VReg)) {
+      RenameMOs.push_back(&MO);
+    }
+
+    for (auto *MO : RenameMOs) {
+      Changed = true;
+      MO->setReg(Rename);
+
+      if (!MO->isDef())
+        MO->setIsKill(false);
+    }
+  }
+
+  return Changed;
+}
+
+static bool doDefKillClear(MachineBasicBlock *MBB) {
+  bool Changed = false;
+
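+  // Scheduling-dependent kill and dead flags are cleared here so that two
+  // otherwise-identical blocks do not diff merely because their liveness
+  // shorthand differs.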
+  for (auto &MI : *MBB) {
+    for (auto &MO : MI.operands()) {
+      if (!MO.isReg())
+        continue;
+      if (!MO.isDef() && MO.isKill()) {
+        Changed = true;
+        MO.setIsKill(false);
+      }
+
+      if (MO.isDef() && MO.isDead()) {
+        Changed = true;
+        MO.setIsDead(false);
+      }
+    }
+  }
+
+  return Changed;
+}
+
+static bool runOnBasicBlock(MachineBasicBlock *MBB,
+                            std::vector<StringRef> &bbNames,
+                            std::vector<unsigned> &renamedInOtherBB,
+                            unsigned &basicBlockNum, unsigned &VRegGapIndex) {
+
+  if (CanonicalizeBasicBlockNumber != ~0U) {
+    if (CanonicalizeBasicBlockNumber != basicBlockNum++)
+      return false;
+    DEBUG(dbgs() << "\n Canonicalizing BasicBlock " << MBB->getName() << "\n";);
+  }
+
+  if (llvm::find(bbNames, MBB->getName()) != bbNames.end()) {
+    DEBUG({
+      dbgs() << "Found potentially duplicate BasicBlocks: " << MBB->getName()
+             << "\n";
+    });
+    return false;
+  }
+
+  DEBUG({
+    dbgs() << "\n\n  NEW BASIC BLOCK: " << MBB->getName() << "  \n\n";
+    dbgs() << "\n\n================================================\n\n";
+  });
+
+  bool Changed = false;
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  const unsigned DummyVReg = GetDummyVReg(MF);
+  const TargetRegisterClass *DummyRC =
+      (DummyVReg == ~0U) ? nullptr : MRI.getRegClass(DummyVReg);
+  if (!DummyRC) return false;
+
+  bbNames.push_back(MBB->getName());
+  DEBUG(dbgs() << "\n\n NEW BASIC BLOCK: " << MBB->getName() << "\n\n";);
+
+  DEBUG(dbgs() << "MBB Before Scheduling:\n"; MBB->dump(););
+  Changed |= rescheduleCanonically(MBB);
+  DEBUG(dbgs() << "MBB After Scheduling:\n"; MBB->dump(););
+
+  std::vector<MachineInstr *> Candidates = populateCandidates(MBB);
+  std::vector<MachineInstr *> VisitedMIs;
+  std::copy(Candidates.begin(), Candidates.end(),
+            std::back_inserter(VisitedMIs));
+
+  std::vector<TypedVReg> VRegs;
+  for (auto candidate : Candidates) {
+    VRegs.push_back(TypedVReg(RSE_NewCandidate));
+
+    std::queue<TypedVReg> RegQueue;
+
+    // Here we walk the vreg operands of a non-root node along our walk.
+    // The root nodes are the original candidates (stores normally); non-root
+    // nodes are everything else (except for the case of copies to physical
+    // registers).
+    for (unsigned i = 1; i < candidate->getNumOperands(); i++) {
+      if (candidate->mayStore() || candidate->isBranch())
+        break;
+
+      MachineOperand &MO = candidate->getOperand(i);
+      if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
+        continue;
+
+      DEBUG(dbgs() << "Enqueue register"; MO.dump(); dbgs() << "\n";);
+      RegQueue.push(TypedVReg(MO.getReg()));
+    }
+
+    // Here we walk the root candidates. We start from the 0th operand because
+    // the root is normally a store of a vreg.
+    for (unsigned i = 0; i < candidate->getNumOperands(); i++) {
+
+      if (!candidate->mayStore() && !candidate->isBranch())
+        break;
+
+      MachineOperand &MO = candidate->getOperand(i);
+
+      // TODO: Do we want to only add vregs here?
+      if (!MO.isReg() && !MO.isFI())
+        continue;
+
+      DEBUG(dbgs() << "Enqueue Reg/FI"; MO.dump(); dbgs() << "\n";);
+
+      RegQueue.push(MO.isReg() ? TypedVReg(MO.getReg()) :
+                    TypedVReg(RSE_FrameIndex));
+    }
+
+    doCandidateWalk(VRegs, RegQueue, VisitedMIs, MBB);
+  }
+
+  // If we have populated no vregs to rename then bail.
+  // The rest of this function does the vreg remapping.
+  if (VRegs.size() == 0)
+    return Changed;
+
+  // Skip some vregs, so we can reckon where we'll land next.
+ SkipVRegs(VRegGapIndex, MRI, DummyRC); + + auto VRegRenameMap = GetVRegRenameMap(VRegs, renamedInOtherBB, MRI, DummyRC); + Changed |= doVRegRenaming(renamedInOtherBB, VRegRenameMap, MRI); + Changed |= doDefKillClear(MBB); + + DEBUG(dbgs() << "Updated MachineBasicBlock:\n"; MBB->dump(); dbgs() << "\n";); + DEBUG(dbgs() << "\n\n================================================\n\n"); + return Changed; +} + +bool MIRCanonicalizer::runOnMachineFunction(MachineFunction &MF) { + + static unsigned functionNum = 0; + if (CanonicalizeFunctionNumber != ~0U) { + if (CanonicalizeFunctionNumber != functionNum++) + return false; + DEBUG(dbgs() << "\n Canonicalizing Function " << MF.getName() << "\n";); + } + + // we need a valid vreg to create a vreg type for skipping all those + // stray vreg numbers so reach alignment/canonical vreg values. + std::vector RPOList = GetRPOList(MF); + + DEBUG( + dbgs() << "\n\n NEW MACHINE FUNCTION: " << MF.getName() << " \n\n"; + dbgs() << "\n\n================================================\n\n"; + dbgs() << "Total Basic Blocks: " << RPOList.size() << "\n"; + for (auto MBB : RPOList) { + dbgs() << MBB->getName() << "\n"; + } + dbgs() << "\n\n================================================\n\n"; + ); + + std::vector BBNames; + std::vector RenamedInOtherBB; + + unsigned GapIdx = 0; + unsigned BBNum = 0; + + bool Changed = false; + + for (auto MBB : RPOList) + Changed |= runOnBasicBlock(MBB, BBNames, RenamedInOtherBB, BBNum, GapIdx); + + return Changed; +} + -- cgit v1.2.3 From 87cdca2231ed8908e603a904131c2f49c247303c Mon Sep 17 00:00:00 2001 From: Quentin Colombet Date: Thu, 2 Nov 2017 23:38:13 +0000 Subject: [AArch64][RegisterBankInfo] Add FPR16 support in value mapping. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317286 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64GenRegisterBankInfo.def | 63 +++++++++++++---------- lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 8 +-- lib/Target/AArch64/AArch64RegisterBankInfo.h | 12 ++--- 3 files changed, 48 insertions(+), 35 deletions(-) diff --git a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def index 7d2cfbeff38..8f17ae4534c 100644 --- a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def +++ b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def @@ -14,19 +14,21 @@ namespace llvm { RegisterBankInfo::PartialMapping AArch64GenRegisterBankInfo::PartMappings[]{ /* StartIdx, Length, RegBank */ - // 0: FPR 32-bit value. + // 0: FPR 16-bit value. + {0, 16, AArch64::FPRRegBank}, + // 1: FPR 32-bit value. {0, 32, AArch64::FPRRegBank}, - // 1: FPR 64-bit value. + // 2: FPR 64-bit value. {0, 64, AArch64::FPRRegBank}, - // 2: FPR 128-bit value. + // 3: FPR 128-bit value. {0, 128, AArch64::FPRRegBank}, - // 3: FPR 256-bit value. + // 4: FPR 256-bit value. {0, 256, AArch64::FPRRegBank}, - // 4: FPR 512-bit value. + // 5: FPR 512-bit value. {0, 512, AArch64::FPRRegBank}, - // 5: GPR 32-bit value. + // 6: GPR 32-bit value. {0, 32, AArch64::GPRRegBank}, - // 6: GPR 64-bit value. + // 7: GPR 64-bit value. {0, 64, AArch64::GPRRegBank}, }; @@ -37,55 +39,62 @@ RegisterBankInfo::ValueMapping AArch64GenRegisterBankInfo::ValMappings[]{ {nullptr, 0}, // 3-operands instructions (all binary operations should end up with one of // those mapping). - // 1: FPR 32-bit value. <-- This must match First3OpsIdx. + // 1: FPR 16-bit value. <-- This must match First3OpsIdx. 
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + // 4: FPR 32-bit value. <-- This must match First3OpsIdx. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, - // 4: FPR 64-bit value. + // 7: FPR 64-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, - // 7: FPR 128-bit value. + // 10: FPR 128-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, - // 10: FPR 256-bit value. + // 13: FPR 256-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1}, - // 13: FPR 512-bit value. + // 16: FPR 512-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1}, - // 16: GPR 32-bit value. + // 19: GPR 32-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, - // 19: GPR 64-bit value. <-- This must match Last3OpsIdx. + // 22: GPR 64-bit value. <-- This must match Last3OpsIdx. {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, // Cross register bank copies. - // 22: FPR 32-bit value to GPR 32-bit value. <-- This must match + // 25: FPR 16-bit value to GPR 16-bit (invalid). <-- This must match // FirstCrossRegCpyIdx. + {nullptr, 1}, + {nullptr, 1}, + // 27: FPR 32-bit value to GPR 32-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, - // 24: FPR 64-bit value to GPR 64-bit value. + // 29: FPR 64-bit value to GPR 64-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, - // 26: FPR 128-bit value to GPR 128-bit value (invalid) + // 31: FPR 128-bit value to GPR 128-bit value (invalid) {nullptr, 1}, {nullptr, 1}, - // 28: FPR 256-bit value to GPR 256-bit value (invalid) + // 33: FPR 256-bit value to GPR 256-bit value (invalid) {nullptr, 1}, {nullptr, 1}, - // 30: FPR 512-bit value to GPR 512-bit value (invalid) + // 35: FPR 512-bit value to GPR 512-bit value (invalid) {nullptr, 1}, {nullptr, 1}, - // 32: GPR 32-bit value to FPR 32-bit value. + // 37: GPR 32-bit value to FPR 32-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, - // 34: GPR 64-bit value to FPR 64-bit value. <-- This must match + // 39: GPR 64-bit value to FPR 64-bit value. 
<-- This must match // LastCrossRegCpyIdx. {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, @@ -145,16 +154,18 @@ unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx, return -1; } if (RBIdx == PMI_FirstFPR) { - if (Size <= 32) + if (Size <= 16) return 0; - if (Size <= 64) + if (Size <= 32) return 1; - if (Size <= 128) + if (Size <= 64) return 2; - if (Size <= 256) + if (Size <= 128) return 3; - if (Size <= 512) + if (Size <= 256) return 4; + if (Size <= 512) + return 5; return -1; } return -1; diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index 391e8ed633d..6e246a798c5 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -87,9 +87,9 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR, {PMI_GPR32, PMI_GPR64}) && "PartialMappingIdx's are incorrectly ordered"); - assert(checkPartialMappingIdx( - PMI_FirstFPR, PMI_LastFPR, - {PMI_FPR32, PMI_FPR64, PMI_FPR128, PMI_FPR256, PMI_FPR512}) && + assert(checkPartialMappingIdx(PMI_FirstFPR, PMI_LastFPR, + {PMI_FPR16, PMI_FPR32, PMI_FPR64, PMI_FPR128, + PMI_FPR256, PMI_FPR512}) && "PartialMappingIdx's are incorrectly ordered"); // Now, the content. // Check partial mapping. @@ -102,6 +102,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR); CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR); + CHECK_PARTIALMAP(PMI_FPR16, 0, 16, RBFPR); CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR); CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR); CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR); @@ -121,6 +122,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) CHECK_VALUEMAP(GPR, 32); CHECK_VALUEMAP(GPR, 64); + CHECK_VALUEMAP(FPR, 16); CHECK_VALUEMAP(FPR, 32); CHECK_VALUEMAP(FPR, 64); CHECK_VALUEMAP(FPR, 128); diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.h b/lib/Target/AArch64/AArch64RegisterBankInfo.h index 6d74a47095a..384b9772927 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.h +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.h @@ -25,10 +25,10 @@ class TargetRegisterInfo; class AArch64GenRegisterBankInfo : public RegisterBankInfo { protected: - enum PartialMappingIdx { PMI_None = -1, - PMI_FPR32 = 1, + PMI_FPR16 = 1, + PMI_FPR32, PMI_FPR64, PMI_FPR128, PMI_FPR256, @@ -37,7 +37,7 @@ protected: PMI_GPR64, PMI_FirstGPR = PMI_GPR32, PMI_LastGPR = PMI_GPR64, - PMI_FirstFPR = PMI_FPR32, + PMI_FirstFPR = PMI_FPR16, PMI_LastFPR = PMI_FPR512, PMI_Min = PMI_FirstFPR, }; @@ -49,10 +49,10 @@ protected: enum ValueMappingIdx { InvalidIdx = 0, First3OpsIdx = 1, - Last3OpsIdx = 19, + Last3OpsIdx = 22, DistanceBetweenRegBanks = 3, - FirstCrossRegCpyIdx = 22, - LastCrossRegCpyIdx = 34, + FirstCrossRegCpyIdx = 25, + LastCrossRegCpyIdx = 39, DistanceBetweenCrossRegCpy = 2 }; -- cgit v1.2.3 From d8375d73687a9b88018cf808abbf8e639e2ad962 Mon Sep 17 00:00:00 2001 From: Quentin Colombet Date: Thu, 2 Nov 2017 23:38:19 +0000 Subject: [AArch64][RegisterBankInfo] Add mapping for G_FPEXT. This fixes http://llvm.org/PR32560. We were missing a description for half floating point type and as a result were using the FPR 32 mapping. Because of the size mismatch the generic code was complaining that the default mapping is not appropriate. 
Fix the mapping description so that the default mapping can be properly applied. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317287 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64GenRegisterBankInfo.def | 43 +++++++++ lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 32 +++++++ lib/Target/AArch64/AArch64RegisterBankInfo.h | 15 ++- .../AArch64/GlobalISel/arm64-regbankselect.mir | 104 +++++++++++++++++++++ 4 files changed, 193 insertions(+), 1 deletion(-) diff --git a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def index 8f17ae4534c..39f50ade747 100644 --- a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def +++ b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def @@ -98,6 +98,18 @@ RegisterBankInfo::ValueMapping AArch64GenRegisterBankInfo::ValMappings[]{ // LastCrossRegCpyIdx. {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + // 41: FPExt: 16 to 32. <-- This must match FPExt16To32Idx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + // 43: FPExt: 16 to 32. <-- This must match FPExt16To64Idx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + // 45: FPExt: 32 to 64. <-- This must match FPExt32To64Idx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + // 47: FPExt vector: 64 to 128. <-- This must match FPExt64To128Idx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, }; bool AArch64GenRegisterBankInfo::checkPartialMap(unsigned Idx, @@ -217,4 +229,35 @@ AArch64GenRegisterBankInfo::getCopyMapping(unsigned DstBankID, ValMappingIdx <= LastCrossRegCpyIdx && "Mapping out of bound"); return &ValMappings[ValMappingIdx]; } + +const RegisterBankInfo::ValueMapping * +AArch64GenRegisterBankInfo::getFPExtMapping(unsigned DstSize, + unsigned SrcSize) { + // We support: + // - For Scalar: + // - 16 to 32. + // - 16 to 64. + // - 32 to 64. + // => FPR 16 to FPR 32|64 + // => FPR 32 to FPR 64 + // - For vectors: + // - v4f16 to v4f32 + // - v2f32 to v2f64 + // => FPR 64 to FPR 128 + + // Check that we have been asked sensible sizes. + if (SrcSize == 16) { + assert((DstSize == 32 || DstSize == 64) && "Unexpected half extension"); + if (DstSize == 32) + return &ValMappings[FPExt16To32Idx]; + return &ValMappings[FPExt16To64Idx]; + } + + if (SrcSize == 32) { + assert(DstSize == 64 && "Unexpected float extension"); + return &ValMappings[FPExt32To64Idx]; + } + assert((SrcSize == 64 || DstSize == 128) && "Unexpected vector extension"); + return &ValMappings[FPExt64To128Idx]; +} } // End llvm namespace. 
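The "<-- This must match" comments in the table are enforced only by convention, so the renumbering can be sanity-checked against the constants from the AArch64RegisterBankInfo.h hunk below. A standalone sketch of the arithmetic the new indices must satisfy (illustrative, not part of the patch):

    // Index constants as they stand after this change.
    enum ValueMappingIdx {
      First3OpsIdx = 1,
      Last3OpsIdx = 22,
      DistanceBetweenRegBanks = 3,
      FirstCrossRegCpyIdx = 25,
      LastCrossRegCpyIdx = 39,
      DistanceBetweenCrossRegCpy = 2,
      FPExt16To32Idx = 41,
    };

    // Eight (bank, size) groups of three operands each; the last group
    // starts at 1 + 7 * 3 = 22.
    static_assert(Last3OpsIdx == First3OpsIdx + 7 * DistanceBetweenRegBanks,
                  "3-operand table is misnumbered");
    // Eight cross-bank copy pairs of two entries each; the last pair starts
    // at 25 + 7 * 2 = 39.
    static_assert(LastCrossRegCpyIdx ==
                      FirstCrossRegCpyIdx + 7 * DistanceBetweenCrossRegCpy,
                  "cross-copy table is misnumbered");
    // The FPExt entries begin immediately after the final cross-copy pair.
    static_assert(FPExt16To32Idx ==
                      LastCrossRegCpyIdx + DistanceBetweenCrossRegCpy,
                  "FPExt entries do not follow the cross-copy entries");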
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index 6e246a798c5..83bf493c9f0 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -175,6 +175,30 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 64); CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 64); +#define CHECK_VALUEMAP_FPEXT(DstSize, SrcSize) \ + do { \ + unsigned PartialMapDstIdx = PMI_FPR##DstSize - PMI_Min; \ + unsigned PartialMapSrcIdx = PMI_FPR##SrcSize - PMI_Min; \ + (void)PartialMapDstIdx; \ + (void)PartialMapSrcIdx; \ + const ValueMapping *Map = getFPExtMapping(DstSize, SrcSize); \ + (void)Map; \ + assert(Map[0].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \ + Map[0].NumBreakDowns == 1 && "FPR" #DstSize \ + " Dst is incorrectly initialized"); \ + assert(Map[1].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \ + Map[1].NumBreakDowns == 1 && "FPR" #SrcSize \ + " Src is incorrectly initialized"); \ + \ + } while (false) + + CHECK_VALUEMAP_FPEXT(32, 16); + CHECK_VALUEMAP_FPEXT(64, 16); + CHECK_VALUEMAP_FPEXT(64, 32); + CHECK_VALUEMAP_FPEXT(128, 64); + assert(verify(TRI) && "Invalid register bank information"); } @@ -455,6 +479,14 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_FMUL: case TargetOpcode::G_FDIV: return getSameKindOfOperandsMapping(MI); + case TargetOpcode::G_FPEXT: { + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + return getInstructionMapping( + DefaultMappingID, /*Cost*/ 1, + getFPExtMapping(DstTy.getSizeInBits(), SrcTy.getSizeInBits()), + /*NumOperands*/ 2); + } case TargetOpcode::COPY: { unsigned DstReg = MI.getOperand(0).getReg(); unsigned SrcReg = MI.getOperand(1).getReg(); diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.h b/lib/Target/AArch64/AArch64RegisterBankInfo.h index 384b9772927..008221dbef5 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.h +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.h @@ -53,7 +53,11 @@ protected: DistanceBetweenRegBanks = 3, FirstCrossRegCpyIdx = 25, LastCrossRegCpyIdx = 39, - DistanceBetweenCrossRegCpy = 2 + DistanceBetweenCrossRegCpy = 2, + FPExt16To32Idx = 41, + FPExt16To64Idx = 43, + FPExt32To64Idx = 45, + FPExt64To128Idx = 47, }; static bool checkPartialMap(unsigned Idx, unsigned ValStartIdx, @@ -82,6 +86,15 @@ protected: static const RegisterBankInfo::ValueMapping * getCopyMapping(unsigned DstBankID, unsigned SrcBankID, unsigned Size); + /// Get the instruction mapping for G_FPEXT. + /// + /// \pre (DstSize, SrcSize) pair is one of the following: + /// (32, 16), (64, 16), (64, 32), (128, 64) + /// + /// \return An InstructionMapping with statically allocated OperandsMapping. 
+ static const RegisterBankInfo::ValueMapping * + getFPExtMapping(unsigned DstSize, unsigned SrcSize); + #define GET_TARGET_REGBANK_CLASS #include "AArch64GenRegisterBank.inc" }; diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir index 4042047dfc2..cc158a29c3e 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir +++ b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir @@ -92,6 +92,10 @@ store double %vres, double* %addr ret void } + + define void @fp16Ext32() { ret void } + define void @fp16Ext64() { ret void } + define void @fp32Ext64() { ret void } ... --- @@ -742,3 +746,103 @@ body: | RET_ReallyLR ... + +--- +# Make sure we map FPEXT on FPR register bank. +# CHECK-LABEL: name: fp16Ext32 +name: fp16Ext32 +alignment: 2 +legalized: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' } +# CHECK-NEXT: - { id: 1, class: gpr, preferred-register: '' } +# CHECK-NEXT: - { id: 2, class: fpr, preferred-register: '' } +# CHECK-NEXT: - { id: 3, class: fpr, preferred-register: '' } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# CHECK: %1:gpr(s32) = COPY %w0 +# CHECK-NEXT: %0:gpr(s16) = G_TRUNC %1 +# %0 has been mapped to GPR, we need to repair to match FPR. +# CHECK-NEXT: %3:fpr(s16) = COPY %0 +# CHECK-NEXT: %2:fpr(s32) = G_FPEXT %3 +# CHECK-NEXT: %s0 = COPY %2 +# CHECK-NEXT: RET_ReallyLR + +body: | + bb.1: + liveins: %w0 + + %1(s32) = COPY %w0 + %0(s16) = G_TRUNC %1(s32) + %2(s32) = G_FPEXT %0(s16) + %s0 = COPY %2(s32) + RET_ReallyLR implicit %s0 + +... + +--- +# Make sure we map FPEXT on FPR register bank. +# CHECK-LABEL: name: fp16Ext64 +name: fp16Ext64 +alignment: 2 +legalized: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' } +# CHECK-NEXT: - { id: 1, class: gpr, preferred-register: '' } +# CHECK-NEXT: - { id: 2, class: fpr, preferred-register: '' } +# CHECK-NEXT: - { id: 3, class: fpr, preferred-register: '' } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# CHECK: %1:gpr(s32) = COPY %w0 +# CHECK-NEXT: %0:gpr(s16) = G_TRUNC %1 +# %0 has been mapped to GPR, we need to repair to match FPR. +# CHECK-NEXT: %3:fpr(s16) = COPY %0 +# CHECK-NEXT: %2:fpr(s64) = G_FPEXT %3 +# CHECK-NEXT: %d0 = COPY %2 +# CHECK-NEXT: RET_ReallyLR + +body: | + bb.1: + liveins: %w0 + + %1(s32) = COPY %w0 + %0(s16) = G_TRUNC %1(s32) + %2(s64) = G_FPEXT %0(s16) + %d0 = COPY %2(s64) + RET_ReallyLR implicit %d0 + +... + +--- +# Make sure we map FPEXT on FPR register bank. +# CHECK-LABEL: name: fp32Ext64 +name: fp32Ext64 +alignment: 2 +legalized: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' } +# CHECK-NEXT: - { id: 1, class: fpr, preferred-register: '' } +# CHECK-NEXT: - { id: 2, class: fpr, preferred-register: '' } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +# CHECK: %0:gpr(s32) = COPY %w0 +# %0 has been mapped to GPR, we need to repair to match FPR. +# CHECK-NEXT: %2:fpr(s32) = COPY %0 +# CHECK-NEXT: %1:fpr(s64) = G_FPEXT %2 +# CHECK-NEXT: %d0 = COPY %1 +# CHECK-NEXT: RET_ReallyLR +body: | + bb.1: + liveins: %w0 + + %0(s32) = COPY %w0 + %1(s64) = G_FPEXT %0(s32) + %d0 = COPY %1(s64) + RET_ReallyLR implicit %d0 + +... 
-- cgit v1.2.3 From b57c6f4150d6525d1c1390fdd84f8ca4151eb570 Mon Sep 17 00:00:00 2001 From: Vedant Kumar Date: Thu, 2 Nov 2017 23:44:20 +0000 Subject: [Verifier] Remove the -verify-debug-info cl::opt This cl::opt has been dead for a while. It's no longer possible to run the verifier without also verifying debug info. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317288 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/IR/Verifier.cpp | 2 -- test/CodeGen/NVPTX/generic-to-nvvm-ir.ll | 2 +- test/DebugInfo/Generic/location-verifier.ll | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index c528f7167e7..5bb1f84d2e5 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -115,8 +115,6 @@ using namespace llvm;

-static cl::opt<bool> VerifyDebugInfo("verify-debug-info", cl::init(true));
-
 namespace llvm {

 struct VerifierSupport {

diff --git a/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll b/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll index f874148c0e8..5df5183dc2f 100644 --- a/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll +++ b/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll @@ -1,6 +1,6 @@ ; Verify functionality of NVPTXGenericToNVVM.cpp pass. ;
-; RUN: opt < %s -march nvptx64 -S -generic-to-nvvm -verify-debug-info | FileCheck %s
+; RUN: opt < %s -march nvptx64 -S -generic-to-nvvm | FileCheck %s
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda"

diff --git a/test/DebugInfo/Generic/location-verifier.ll b/test/DebugInfo/Generic/location-verifier.ll index b1e0805428c..3c6bb425a66 100644 --- a/test/DebugInfo/Generic/location-verifier.ll +++ b/test/DebugInfo/Generic/location-verifier.ll @@ -1,4 +1,4 @@
-; RUN: llvm-as -disable-output -verify-debug-info -o - < %s 2>&1 | FileCheck %s
+; RUN: llvm-as -disable-output -o - < %s 2>&1 | FileCheck %s
 ; ModuleID = 'test.c' target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.10.0" -- cgit v1.2.3

From c7ddffcd3298d393e914d823a7c34d0915588bb7 Mon Sep 17 00:00:00 2001 From: Jake Ehrlich Date: Thu, 2 Nov 2017 23:45:51 +0000 Subject: Reland "Add feature to determine if host architecture is 64-bit in llvm-lit"

A member of config was removed in this patch which resulted in errors I didn't expect. Removing config.host_arch will take more work, so I'm re-adding that field.

Differential Revision: https://reviews.llvm.org/D39465

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317289 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/lit.site.cfg.py.in | 1 + 1 file changed, 1 insertion(+)

diff --git a/test/lit.site.cfg.py.in b/test/lit.site.cfg.py.in index efdd016e45d..dff46dcff32 100644 --- a/test/lit.site.cfg.py.in +++ b/test/lit.site.cfg.py.in @@ -42,6 +42,7 @@ config.build_shared_libs = @BUILD_SHARED_LIBS@ config.link_llvm_dylib = @LLVM_LINK_LLVM_DYLIB@ config.llvm_libxml2_enabled = "@LLVM_LIBXML2_ENABLED@" config.llvm_host_triple = '@LLVM_HOST_TRIPLE@'
+config.host_arch = "@HOST_ARCH@"

 # Support substitution of the tools_dir with user parameters. This is
 # used when we can't determine the tool dir at configuration time.
-- cgit v1.2.3

From 1bd292583c0be67468743c6ae20046b6019f4da3 Mon Sep 17 00:00:00 2001 From: Sriraman Tallam Date: Fri, 3 Nov 2017 00:10:19 +0000 Subject: Avoid PLT for external calls when attribute nonlazybind is used.
Differential Revision: https://reviews.llvm.org/D39065 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317292 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86Subtarget.cpp | 11 +++++++++-- test/CodeGen/X86/no-plt.ll | 23 +++++++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 test/CodeGen/X86/no-plt.ll diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index b0ce1335bd3..9e060f97df3 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -144,6 +144,15 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV) const { unsigned char X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV, const Module &M) const { + const Function *F = dyn_cast_or_null(GV); + + // Do not use the PLT when explicitly told to do so for ELF 64-bit + // target. + if (isTargetELF() && is64Bit() && F && + F->hasFnAttribute(Attribute::NonLazyBind) && + GV->isDeclarationForLinker()) + return X86II::MO_GOTPCREL; + if (TM.shouldAssumeDSOLocal(M, GV)) return X86II::MO_NO_FLAG; @@ -153,8 +162,6 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV, return X86II::MO_DLLIMPORT; } - const Function *F = dyn_cast_or_null(GV); - if (isTargetELF()) { if (is64Bit() && F && (CallingConv::X86_RegCall == F->getCallingConv())) // According to psABI, PLT stub clobbers XMM8-XMM15. diff --git a/test/CodeGen/X86/no-plt.ll b/test/CodeGen/X86/no-plt.ll new file mode 100644 index 00000000000..77ef686cc85 --- /dev/null +++ b/test/CodeGen/X86/no-plt.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \ +; RUN: | FileCheck -check-prefix=X64 %s +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu \ +; RUN: | FileCheck -check-prefix=X64 %s + +define i32 @main() #0 { +; X64: callq *_Z3foov@GOTPCREL(%rip) +; X64: callq _Z3barv + +entry: + %retval = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + %call1 = call i32 @_Z3foov() + %call2 = call i32 @_Z3barv() + ret i32 0 +} + +; Function Attrs: nonlazybind +declare i32 @_Z3foov() #1 + +declare i32 @_Z3barv() #2 + +attributes #1 = { nonlazybind } -- cgit v1.2.3 From 931b3020257e13988c2498855ffa76e16afa7712 Mon Sep 17 00:00:00 2001 From: Vedant Kumar Date: Fri, 3 Nov 2017 01:01:28 +0000 Subject: [LSR] Clarify a comment. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317295 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/LoopStrengthReduce.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index bbb179d3790..7f03f2379e7 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1037,7 +1037,7 @@ struct LSRFixup { Value *OperandValToReplace = nullptr; /// If this user is to use the post-incremented value of an induction - /// variable, this variable is non-null and holds the loop associated with the + /// variable, this set is non-empty and holds the loops associated with the /// induction variable. PostIncLoopSet PostIncLoops; -- cgit v1.2.3 From 06d5ebdc63193e9cc621d9d85d57b4205981f692 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Nov 2017 05:19:34 +0000 Subject: [TableGen] Add an extra blank line to DAGISel output file to separate functions. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317298 91177308-0d34-0410-b5e6-96231b3b80d8 --- utils/TableGen/DAGISelMatcherEmitter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/TableGen/DAGISelMatcherEmitter.cpp b/utils/TableGen/DAGISelMatcherEmitter.cpp index 76370cdad67..672f9f8620f 100644 --- a/utils/TableGen/DAGISelMatcherEmitter.cpp +++ b/utils/TableGen/DAGISelMatcherEmitter.cpp @@ -974,7 +974,7 @@ void llvm::EmitMatcherTable(const Matcher *TheMatcher, OS << " #undef TARGET_VAL\n"; OS << " SelectCodeCommon(N, MatcherTable,sizeof(MatcherTable));\n"; - OS << "}\n"; + OS << "}\n\n"; // Next up, emit the function for node and pattern predicates: MatcherEmitter.EmitPredicateFunctions(OS); -- cgit v1.2.3 From c43a693efb02155f32e2f61310262082d27f91f3 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Nov 2017 06:48:02 +0000 Subject: [X86] Remove PALIGNR/VALIGN handling from combineBitcastForMaskedOp and move to isel patterns instead. Prefer 128-bit VALIGND/VALIGNQ over PALIGNR during lowering when possible. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317299 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 36 +++----- lib/Target/X86/X86InstrAVX512.td | 117 ++++++++++++++++++++++++ test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll | 4 +- 3 files changed, 133 insertions(+), 24 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b178ad6c13e..d64cc411391 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -10716,10 +10716,16 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. - if (Subtarget.hasSSSE3()) + if (Subtarget.hasSSSE3()) { + if (Subtarget.hasVLX()) + if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2, + Mask, Subtarget, DAG)) + return Rotate; + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; + } // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. @@ -11016,10 +11022,16 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. - if (Subtarget.hasSSSE3()) + if (Subtarget.hasSSSE3()) { + if (Subtarget.hasVLX()) + if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) + return Rotate; + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; + } // Assume that a single SHUFPS is faster than an alternative sequence of // multiple instructions (even if the CPU has a domain penalty). @@ -30674,26 +30686,6 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, unsigned Opcode = Op.getOpcode(); switch (Opcode) { - case X86ISD::PALIGNR: - // PALIGNR can be converted to VALIGND/Q for 128-bit vectors. 
- if (!VT.is128BitVector()) - return false; - Opcode = X86ISD::VALIGN; - LLVM_FALLTHROUGH; - case X86ISD::VALIGN: { - if (EltVT != MVT::i32 && EltVT != MVT::i64) - return false; - uint64_t Imm = Op.getConstantOperandVal(2); - MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); - unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits(); - unsigned EltSize = EltVT.getSizeInBits(); - // Make sure we can represent the same shift with the new VT. - if ((ShiftAmt % EltSize) != 0) - return false; - Imm = ShiftAmt / EltSize; - return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1), - DAG.getConstant(Imm, DL, MVT::i8)); - } case X86ISD::SHUF128: { if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64) return false; diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index a73ee19423d..17b5e10c6a4 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -8911,6 +8911,123 @@ defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" , avx512vl_i8_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>; +// Fragments to help convert valignq into masked valignd. Or valignq/valignd +// into vpalignr. +def ValignqImm32XForm : SDNodeXFormgetZExtValue() * 2, SDLoc(N)); +}]>; +def ValignqImm8XForm : SDNodeXFormgetZExtValue() * 8, SDLoc(N)); +}]>; +def ValigndImm8XForm : SDNodeXFormgetZExtValue() * 4, SDLoc(N)); +}]>; + +multiclass avx512_vpalign_mask_lowering { + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, From.RC:$src2, + imm:$src3))), + To.RC:$src0)), + (!cast(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask, + To.RC:$src1, To.RC:$src2, + (ImmXForm imm:$src3))>; + + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, From.RC:$src2, + imm:$src3))), + To.ImmAllZerosV)), + (!cast(OpcodeStr#"rrikz") To.KRCWM:$mask, + To.RC:$src1, To.RC:$src2, + (ImmXForm imm:$src3))>; + + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, + (bitconvert (To.LdFrag addr:$src2)), + imm:$src3))), + To.RC:$src0)), + (!cast(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask, + To.RC:$src1, addr:$src2, + (ImmXForm imm:$src3))>; + + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, + (bitconvert (To.LdFrag addr:$src2)), + imm:$src3))), + To.ImmAllZerosV)), + (!cast(OpcodeStr#"rmikz") To.KRCWM:$mask, + To.RC:$src1, addr:$src2, + (ImmXForm imm:$src3))>; +} + +multiclass avx512_vpalign_mask_lowering_mb : + avx512_vpalign_mask_lowering { + def : Pat<(From.VT (OpNode From.RC:$src1, + (bitconvert (To.VT (X86VBroadcast + (To.ScalarLdFrag addr:$src2)))), + imm:$src3)), + (!cast(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2, + (ImmXForm imm:$src3))>; + + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, + (bitconvert + (To.VT (X86VBroadcast + (To.ScalarLdFrag addr:$src2)))), + imm:$src3))), + To.RC:$src0)), + (!cast(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask, + To.RC:$src1, addr:$src2, + (ImmXForm imm:$src3))>; + + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, + (bitconvert + (To.VT (X86VBroadcast + (To.ScalarLdFrag addr:$src2)))), + imm:$src3))), + To.ImmAllZerosV)), + (!cast(OpcodeStr#"rmbikz") To.KRCWM:$mask, + To.RC:$src1, addr:$src2, + (ImmXForm imm:$src3))>; +} + +let Predicates = [HasAVX512] in { + // For 512-bit we lower to the widest element type we can. 
So we only need + // to handle converting valignq to valignd. + defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ", X86VAlign, v8i64_info, + v16i32_info, ValignqImm32XForm>; +} + +let Predicates = [HasVLX] in { + // For 128-bit we lower to the widest element type we can. So we only need + // to handle converting valignq to valignd. + defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ128", X86VAlign, v2i64x_info, + v4i32x_info, ValignqImm32XForm>; + // For 256-bit we lower to the widest element type we can. So we only need + // to handle converting valignq to valignd. + defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ256", X86VAlign, v4i64x_info, + v8i32x_info, ValignqImm32XForm>; +} + +let Predicates = [HasVLX, HasBWI] in { + // We can turn 128 and 256 bit VALIGND/VALIGNQ into VPALIGNR. + defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v2i64x_info, + v16i8x_info, ValignqImm8XForm>; + defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v4i32x_info, + v16i8x_info, ValigndImm8XForm>; +} + defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" , avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>; diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index b6723ee50b0..6c6fad794c8 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -4712,8 +4712,8 @@ declare <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32>, <8 x i32>, i32, define <8 x i32>@test_int_x86_avx512_mask_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x03,0xd9,0x06] -; CHECK-NEXT: ## ymm3 = ymm1[6,7],ymm0[0,1,2,3,4,5] +; CHECK-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03] +; CHECK-NEXT: ## ymm3 = ymm1[3],ymm0[0,1,2] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x03,0xd1,0x06] ; CHECK-NEXT: ## ymm2 {%k1} = ymm1[6,7],ymm0[0,1,2,3,4,5] -- cgit v1.2.3 From 37104fff45a7783a64ab121f9aee29da922678dd Mon Sep 17 00:00:00 2001 From: Martin Storsjo Date: Fri, 3 Nov 2017 07:18:14 +0000 Subject: [llvm-nm] Print 'I' for import table data in COFF The character gets uppercased into 'I' when it's a global symbol. In GNU binutils, nm prints 'I' for symbols classified by bfd_is_ind_section - which probably isn't exactly/only import tables. When building for win32, (some incarnations of?) libtool has got rules that try to inspect linked libraries, and in order to be sure that it is linking to a DLL import library as opposed to a static library, it expects to find the string " I " in the output of $NM when run on such an import library. GNU binutils nm also flags all of the .idata$X chunks as 'i' (while this patch only makes it set on .idata$2 and .idata$6) and also flags __imp__function as 'I'. 
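For example, with this change llvm-nm reports the import descriptor symbols of an import library as follows (output lines as checked by the updated importlibrary.test):

    $ llvm-nm -B example.lib
    00000000 I __IMPORT_DESCRIPTOR_example
    00000000 I __NULL_IMPORT_DESCRIPTOR
    00000000 R __imp__constant
    ...

which provides the " I " marker that libtool scans for.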
Differential Revision: https://reviews.llvm.org/D39540 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317300 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-nm/X86/importlibrary.test | 2 ++ tools/llvm-nm/llvm-nm.cpp | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/test/tools/llvm-nm/X86/importlibrary.test b/test/tools/llvm-nm/X86/importlibrary.test index 9111694c2c6..107628d09ef 100644 --- a/test/tools/llvm-nm/X86/importlibrary.test +++ b/test/tools/llvm-nm/X86/importlibrary.test @@ -1,5 +1,7 @@ # RUN: llvm-nm -B %S/Inputs/example.lib | FileCheck --match-full-lines %s +CHECK: 00000000 I __IMPORT_DESCRIPTOR_example +CHECK: 00000000 I __NULL_IMPORT_DESCRIPTOR CHECK: 00000000 R __imp__constant CHECK: 00000000 R _constant CHECK: 00000000 D __imp__data diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp index 4ad0d95d67f..1b093f501d5 100644 --- a/tools/llvm-nm/llvm-nm.cpp +++ b/tools/llvm-nm/llvm-nm.cpp @@ -946,6 +946,10 @@ static char getSymbolNMTypeChar(COFFObjectFile &Obj, symbol_iterator I) { section_iterator SecI = *SecIOrErr; const coff_section *Section = Obj.getCOFFSection(*SecI); Characteristics = Section->Characteristics; + StringRef SectionName; + Obj.getSectionName(Section, SectionName); + if (SectionName.startswith(".idata")) + return 'i'; } switch (Symb.getSectionNumber()) { -- cgit v1.2.3 From f30757f3b0769ec6504e75254f95b66f6dd0f50c Mon Sep 17 00:00:00 2001 From: Martin Storsjo Date: Fri, 3 Nov 2017 07:18:21 +0000 Subject: [llvm-nm] Don't error out on multiple occurrances of the -g/--external-only flag GNU binutils nm doesn't error out on this, and some projects' build systems can end up doing that in some cases. Allowing that seems like a better target than trying to avoid user projects passing multiple -g parameters to $NM. 
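For example, both of the following invocations now succeed and print the same symbol list, whereas the second was previously rejected by the command-line parser for repeating a unary option (input file as in the updated test):

    llvm-nm -g hello.obj.macho-x86_64
    llvm-nm -g -g hello.obj.macho-x86_64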
Differential Revision: https://reviews.llvm.org/D39539 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317301 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-nm/X86/externalonly.test | 1 + tools/llvm-nm/llvm-nm.cpp | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/test/tools/llvm-nm/X86/externalonly.test b/test/tools/llvm-nm/X86/externalonly.test index c3741298786..2a1853b426f 100644 --- a/test/tools/llvm-nm/X86/externalonly.test +++ b/test/tools/llvm-nm/X86/externalonly.test @@ -1,4 +1,5 @@ # RUN: llvm-nm -g %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s +# RUN: llvm-nm -g -g %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s # CHECK-NOT: EH_frame0 # CHECK: _main diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp index 1b093f501d5..85204300284 100644 --- a/tools/llvm-nm/llvm-nm.cpp +++ b/tools/llvm-nm/llvm-nm.cpp @@ -85,9 +85,11 @@ cl::alias DefinedOnly2("U", cl::desc("Alias for --defined-only"), cl::aliasopt(DefinedOnly), cl::Grouping); cl::opt ExternalOnly("extern-only", - cl::desc("Show only external symbols")); + cl::desc("Show only external symbols"), + cl::ZeroOrMore); cl::alias ExternalOnly2("g", cl::desc("Alias for --extern-only"), - cl::aliasopt(ExternalOnly), cl::Grouping); + cl::aliasopt(ExternalOnly), cl::Grouping, + cl::ZeroOrMore); cl::opt BSDFormat("B", cl::desc("Alias for --format=bsd"), cl::Grouping); -- cgit v1.2.3 From 691ff5f85039e136ceeca85db6c2b368e2729fba Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Fri, 3 Nov 2017 07:30:45 +0000 Subject: [NFC] Get rid of hard-coded value ID in test git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317303 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Transforms/IRCE/add-metadata-pre-post-loops.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Transforms/IRCE/add-metadata-pre-post-loops.ll b/test/Transforms/IRCE/add-metadata-pre-post-loops.ll index 488d4b479ba..0225af903ef 100644 --- a/test/Transforms/IRCE/add-metadata-pre-post-loops.ll +++ b/test/Transforms/IRCE/add-metadata-pre-post-loops.ll @@ -38,7 +38,7 @@ exit: ; preds = %in.bounds, %entry define void @single_access_with_preloop(i32 *%arr, i32 *%a_len_ptr, i32 %n, i32 %offset) { ; CHECK-LABEL: @single_access_with_preloop( ; CHECK-LABEL: in.bounds.preloop -; CHECK: br i1 %14, label %loop.preloop, label %preloop.exit.selector, !llvm.loop !8, !irce.loop.clone !7 +; CHECK: br i1 [[COND:%[^ ]+]], label %loop.preloop, label %preloop.exit.selector, !llvm.loop !8, !irce.loop.clone !7 ; CHECK-LABEL: in.bounds.postloop ; CHECK: br i1 %next.postloop, label %loop.postloop, label %exit.loopexit.loopexit, !llvm.loop !9, !irce.loop.clone !7 entry: -- cgit v1.2.3 From 19a3ba35df240471e549b2c4d0c7da390ec0064d Mon Sep 17 00:00:00 2001 From: Martin Storsjo Date: Fri, 3 Nov 2017 07:33:20 +0000 Subject: [AArch64] Use dwarf exception handling on MinGW Ideally we should probably produce WinEH here as well, but until then, we can use dwarf exceptions, without any further changes required in clang, libunwind or libcxxabi. 
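As the new dwarf-cfi.ll test checks, a function containing an invoke/landingpad pair compiled for the aarch64-windows-gnu triple now carries DWARF CFI and LSDA directives, for instance:

    _Z1gv:
        .cfi_startproc
        .cfi_personality 0, __gxx_personality_v0
        .cfi_lsda 0, .Lexception0
        ...
        .cfi_endproc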
Differential Revision: https://reviews.llvm.org/D39535 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317304 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 4 ++- .../MCTargetDesc/AArch64WinCOFFStreamer.cpp | 8 +++++ test/CodeGen/AArch64/dwarf-cfi.ll | 36 ++++++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 test/CodeGen/AArch64/dwarf-cfi.ll diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 7fba4849438..c5da457c38f 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -106,13 +106,15 @@ AArch64MCAsmInfoCOFF::AArch64MCAsmInfoCOFF() { PrivateLabelPrefix = ".L"; AlignmentIsInBytes = false; SupportsDebugInformation = true; - ExceptionsType = ExceptionHandling::WinEH; + CodePointerSize = 8; } AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() { CommentString = ";"; + ExceptionsType = ExceptionHandling::WinEH; } AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() { CommentString = "//"; + ExceptionsType = ExceptionHandling::DwarfCFI; } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp index 9d0f39e5f6a..c88363d2c25 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp @@ -23,7 +23,15 @@ public: std::unique_ptr CE, raw_pwrite_stream &OS) : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), OS) {} + + void FinishImpl() override; }; + +void AArch64WinCOFFStreamer::FinishImpl() { + EmitFrames(nullptr); + + MCWinCOFFStreamer::FinishImpl(); +} } // end anonymous namespace namespace llvm { diff --git a/test/CodeGen/AArch64/dwarf-cfi.ll b/test/CodeGen/AArch64/dwarf-cfi.ll new file mode 100644 index 00000000000..a75bcd19c69 --- /dev/null +++ b/test/CodeGen/AArch64/dwarf-cfi.ll @@ -0,0 +1,36 @@ +; RUN: llc -mtriple aarch64-windows-gnu -filetype=asm -o - %s | FileCheck %s + +define void @_Z1gv() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + invoke void @_Z1fv() + to label %try.cont unwind label %lpad + +lpad: + %0 = landingpad { i8*, i32 } + catch i8* null + %1 = extractvalue { i8*, i32 } %0, 0 + %2 = tail call i8* @__cxa_begin_catch(i8* %1) #2 + tail call void @__cxa_end_catch() + br label %try.cont + +try.cont: + ret void +} + +declare void @_Z1fv() + +declare i32 @__gxx_personality_v0(...) + +declare i8* @__cxa_begin_catch(i8*) + +declare void @__cxa_end_catch() + +; CHECK-LABEL: _Z1gv: +; CHECK: .cfi_startproc +; CHECK: .cfi_personality 0, __gxx_personality_v0 +; CHECK: .cfi_lsda 0, .Lexception0 +; CHECK: str x30, [sp, #-16]! +; CHECK: .cfi_def_cfa_offset 16 +; CHECK: .cfi_offset w30, -16 +; CHECK: ldr x30, [sp], #16 +; CHECK: .cfi_endproc -- cgit v1.2.3 From ba9125e489dab87e4706bef3408a943967e86415 Mon Sep 17 00:00:00 2001 From: Francis Visoiu Mistrih Date: Fri, 3 Nov 2017 09:46:36 +0000 Subject: [PEI] Simplify handling of targets with no phys regs. NFC Make doSpillCalleeSavedRegs a member function, instead of passing most of the members of PEI as arguments. 
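In outline, the change replaces a file-static helper that had to be handed PEI's internal state with a member function that reads that state directly; the two signatures, as in the patch, are:

    // Before: every piece of PEI state passed explicitly.
    static void spillCalleeSavedRegs(MachineFunction &Fn, RegScavenger *RS,
                                     unsigned &MinCSFrameIndex,
                                     unsigned &MaxCSFrameIndex,
                                     const MBBVector &SaveBlocks,
                                     const MBBVector &RestoreBlocks);

    // After: the state is read from PEI's own members.
    void PEI::spillCalleeSavedRegs(MachineFunction &Fn);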
Differential Review: https://reviews.llvm.org/D35642 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317309 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/PrologEpilogInserter.cpp | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index d9e9b3360a0..d611c9b45c5 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -76,12 +76,6 @@ using namespace llvm; using MBBVector = SmallVector; -static void spillCalleeSavedRegs(MachineFunction &MF, RegScavenger *RS, - unsigned &MinCSFrameIndex, - unsigned &MaxCXFrameIndex, - const MBBVector &SaveBlocks, - const MBBVector &RestoreBlocks); - namespace { class PEI : public MachineFunctionPass { @@ -125,6 +119,7 @@ private: void calculateCallFrameInfo(MachineFunction &Fn); void calculateSaveRestoreBlocks(MachineFunction &Fn); + void spillCalleeSavedRegs(MachineFunction &MF); void calculateFrameObjectOffsets(MachineFunction &Fn); void replaceFrameIndices(MachineFunction &Fn); @@ -197,8 +192,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) { // Handle CSR spilling and restoring, for targets that need it. if (Fn.getTarget().usesPhysRegsForPEI()) - spillCalleeSavedRegs(Fn, RS, MinCSFrameIndex, MaxCSFrameIndex, SaveBlocks, - RestoreBlocks); + spillCalleeSavedRegs(Fn); // Allow the target machine to make final modifications to the function // before the frame layout is finalized. @@ -505,11 +499,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock, } } -static void spillCalleeSavedRegs(MachineFunction &Fn, RegScavenger *RS, - unsigned &MinCSFrameIndex, - unsigned &MaxCSFrameIndex, - const MBBVector &SaveBlocks, - const MBBVector &RestoreBlocks) { +void PEI::spillCalleeSavedRegs(MachineFunction &Fn) { // We can't list this requirement in getRequiredProperties because some // targets (WebAssembly) use virtual registers past this point, and the pass // pipeline is set up without giving the passes a chance to look at the -- cgit v1.2.3 From 74ecc3ab6b5c584d09919664232287258270e39b Mon Sep 17 00:00:00 2001 From: "Ivan A. Kosarev" Date: Fri, 3 Nov 2017 10:26:25 +0000 Subject: [Analysis] Refine matching and merging of TBAA tags This patch combines the code that matches and merges TBAA access tags. The aim is to simplify future changes and making sure that these operations produce consistent results. Differential Revision: https://reviews.llvm.org/D39463 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317311 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Analysis/TypeBasedAliasAnalysis.cpp | 173 ++++++++++++++++++-------------- 1 file changed, 95 insertions(+), 78 deletions(-) diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp index 3a3a7ad3955..8812ca207ba 100644 --- a/lib/Analysis/TypeBasedAliasAnalysis.cpp +++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -314,17 +314,8 @@ AliasResult TypeBasedAAResult::alias(const MemoryLocation &LocA, if (!EnableTBAA) return AAResultBase::alias(LocA, LocB); - // Get the attached MDNodes. If either value lacks a tbaa MDNode, we must - // be conservative. - const MDNode *AM = LocA.AATags.TBAA; - if (!AM) - return AAResultBase::alias(LocA, LocB); - const MDNode *BM = LocB.AATags.TBAA; - if (!BM) - return AAResultBase::alias(LocA, LocB); - - // If they may alias, chain to the next AliasAnalysis. - if (Aliases(AM, BM)) + // If accesses may alias, chain to the next AliasAnalysis. 
+ if (Aliases(LocA.AATags.TBAA, LocB.AATags.TBAA)) return AAResultBase::alias(LocA, LocB); // Otherwise return a definitive result. @@ -424,25 +415,24 @@ bool MDNode::isTBAAVtableAccess() const { return false; } +static bool matchAccessTags(const MDNode *A, const MDNode *B, + const MDNode **GenericTag = nullptr); + MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) { + const MDNode *GenericTag; + matchAccessTags(A, B, &GenericTag); + return const_cast(GenericTag); +} + +static const MDNode *getLeastCommonType(const MDNode *A, const MDNode *B) { if (!A || !B) return nullptr; if (A == B) return A; - // For struct-path aware TBAA, we use the access type of the tag. - assert(isStructPathTBAA(A) && isStructPathTBAA(B) && - "Auto upgrade should have taken care of this!"); - A = cast_or_null(MutableTBAAStructTagNode(A).getAccessType()); - if (!A) - return nullptr; - B = cast_or_null(MutableTBAAStructTagNode(B).getAccessType()); - if (!B) - return nullptr; - - SmallSetVector PathA; - MutableTBAANode TA(A); + SmallSetVector PathA; + TBAANode TA(A); while (TA.getNode()) { if (PathA.count(TA.getNode())) report_fatal_error("Cycle found in TBAA metadata."); @@ -450,8 +440,8 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) { TA = TA.getParent(); } - SmallSetVector PathB; - MutableTBAANode TB(B); + SmallSetVector PathB; + TBAANode TB(B); while (TB.getNode()) { if (PathB.count(TB.getNode())) report_fatal_error("Cycle found in TBAA metadata."); @@ -462,7 +452,7 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) { int IA = PathA.size() - 1; int IB = PathB.size() - 1; - MDNode *Ret = nullptr; + const MDNode *Ret = nullptr; while (IA >= 0 && IB >= 0) { if (PathA[IA] == PathB[IB]) Ret = PathA[IA]; @@ -472,17 +462,7 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) { --IB; } - // We either did not find a match, or the only common base "type" is - // the root node. In either case, we don't have any useful TBAA - // metadata to attach. - if (!Ret || Ret->getNumOperands() < 2) - return nullptr; - - // We need to convert from a type node to a tag node. - Type *Int64 = IntegerType::get(A->getContext(), 64); - Metadata *Ops[3] = {Ret, Ret, - ConstantAsMetadata::get(ConstantInt::get(Int64, 0))}; - return MDNode::get(A->getContext(), Ops); + return Ret; } void Instruction::getAAMetadata(AAMDNodes &N, bool Merge) const { @@ -505,70 +485,107 @@ void Instruction::getAAMetadata(AAMDNodes &N, bool Merge) const { N.NoAlias = getMetadata(LLVMContext::MD_noalias); } -/// Aliases - Test whether the type represented by A may alias the -/// type represented by B. -bool TypeBasedAAResult::Aliases(const MDNode *A, const MDNode *B) const { +static bool findAccessType(TBAAStructTagNode BaseTag, + const MDNode *AccessTypeNode, + uint64_t &OffsetInBase) { + // Start from the base type, follow the edge with the correct offset in + // the type DAG and adjust the offset until we reach the access type or + // until we reach a root node. + TBAAStructTypeNode BaseType(BaseTag.getBaseType()); + OffsetInBase = BaseTag.getOffset(); + + while (const MDNode *BaseTypeNode = BaseType.getNode()) { + if (BaseTypeNode == AccessTypeNode) + return true; + + // Follow the edge with the correct offset, Offset will be adjusted to + // be relative to the field type. 
+ BaseType = BaseType.getParent(OffsetInBase); + } + return false; +} + +static const MDNode *createAccessTag(const MDNode *AccessType) { + Type *Int64 = IntegerType::get(AccessType->getContext(), 64); + auto *ImmutabilityFlag = ConstantAsMetadata::get(ConstantInt::get(Int64, 0)); + Metadata *Ops[] = {const_cast(AccessType), + const_cast(AccessType), ImmutabilityFlag}; + return MDNode::get(AccessType->getContext(), Ops); +} + +/// matchTags - Return true if the given couple of accesses are allowed to +/// overlap. If \arg GenericTag is not null, then on return it points to the +/// most generic access descriptor for the given two. +static bool matchAccessTags(const MDNode *A, const MDNode *B, + const MDNode **GenericTag) { + if (A == B) { + if (GenericTag) + *GenericTag = A; + return true; + } + + // Accesses with no TBAA information may alias with any other accesses. + if (!A || !B) { + if (GenericTag) + *GenericTag = nullptr; + return true; + } + // Verify that both input nodes are struct-path aware. Auto-upgrade should // have taken care of this. - assert(isStructPathTBAA(A) && "MDNode A is not struct-path aware."); - assert(isStructPathTBAA(B) && "MDNode B is not struct-path aware."); + assert(isStructPathTBAA(A) && "Access A is not struct-path aware!"); + assert(isStructPathTBAA(B) && "Access B is not struct-path aware!"); - // Keep track of the root node for A and B. - TBAAStructTypeNode RootA, RootB; TBAAStructTagNode TagA(A), TagB(B); // TODO: We need to check if AccessType of TagA encloses AccessType of // TagB to support aggregate AccessType. If yes, return true. - // Start from the base type of A, follow the edge with the correct offset in - // the type DAG and adjust the offset until we reach the base type of B or - // until we reach the Root node. - // Compare the adjusted offset once we have the same base. - - // Climb the type DAG from base type of A to see if we reach base type of B. const MDNode *BaseA = TagA.getBaseType(); const MDNode *BaseB = TagB.getBaseType(); - uint64_t OffsetA = TagA.getOffset(), OffsetB = TagB.getOffset(); - for (TBAAStructTypeNode T(BaseA);;) { - if (T.getNode() == BaseB) - // Base type of A encloses base type of B, check if the offsets match. - return OffsetA == OffsetB; - - RootA = T; - // Follow the edge with the correct offset, OffsetA will be adjusted to - // be relative to the field type. - T = T.getParent(OffsetA); - if (!T.getNode()) - break; - } - // Reset OffsetA and climb the type DAG from base type of B to see if we reach - // base type of A. - OffsetA = TagA.getOffset(); - for (TBAAStructTypeNode T(BaseB);;) { - if (T.getNode() == BaseA) - // Base type of B encloses base type of A, check if the offsets match. - return OffsetA == OffsetB; + // Climb the type DAG from base type of A to see if we reach base type of B. + uint64_t OffsetA; + if (findAccessType(TagA, BaseB, OffsetA)) { + if (GenericTag) + *GenericTag = createAccessTag(TagB.getAccessType()); + return OffsetA == TagB.getOffset(); + } - RootB = T; - // Follow the edge with the correct offset, OffsetB will be adjusted to - // be relative to the field type. - T = T.getParent(OffsetB); - if (!T.getNode()) - break; + // Climb the type DAG from base type of B to see if we reach base type of A. + uint64_t OffsetB; + if (findAccessType(TagB, BaseA, OffsetB)) { + if (GenericTag) + *GenericTag = createAccessTag(TagA.getAccessType()); + return OffsetB == TagA.getOffset(); } - // Neither node is an ancestor of the other. 
+ // If neither node is an ancestor of the other, then try to find the type + // that is common to both the final access types. + const MDNode *CommonType = getLeastCommonType(TagA.getAccessType(), + TagB.getAccessType()); + + // If there is no common type or the only common type is the root node, then + // we don't have any useful generic access tag to return. + if (GenericTag) + *GenericTag = !CommonType || CommonType->getNumOperands() < 2 ? + nullptr : createAccessTag(CommonType); // If they have different roots, they're part of different potentially // unrelated type systems, so we must be conservative. - if (RootA.getNode() != RootB.getNode()) + if (!CommonType) return true; // If they have the same root, then we've proved there's no alias. return false; } +/// Aliases - Test whether the access represented by tag A may alias the +/// access represented by tag B. +bool TypeBasedAAResult::Aliases(const MDNode *A, const MDNode *B) const { + return matchAccessTags(A, B); +} + AnalysisKey TypeBasedAA::Key; TypeBasedAAResult TypeBasedAA::run(Function &F, FunctionAnalysisManager &AM) { -- cgit v1.2.3 From 5281112161326303f9a4571f3c7492fc2f2be6e6 Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Fri, 3 Nov 2017 10:30:12 +0000 Subject: [ARM GlobalISel] Move the check for Thumb higher up We're currently bailing out for Thumb targets while lowering formal parameters, but there used to be some other checks before it, which could've caused some functions (e.g. those without formal parameters) to sneak through unnoticed. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317312 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMCallLowering.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index e1323cd9427..9c10a1c79a4 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -417,6 +417,12 @@ struct FormalArgHandler : public IncomingValueHandler { bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef VRegs) const { + auto &TLI = *getTLI(); + auto Subtarget = TLI.getSubtarget(); + + if (Subtarget->isThumb()) + return false; + // Quick exit if there aren't any args if (F.arg_empty()) return true; @@ -427,12 +433,6 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, auto &MF = MIRBuilder.getMF(); auto &MBB = MIRBuilder.getMBB(); auto DL = MF.getDataLayout(); - auto &TLI = *getTLI(); - - auto Subtarget = TLI.getSubtarget(); - - if (Subtarget->isThumb()) - return false; for (auto &Arg : F.args()) if (!isSupportedType(DL, TLI, Arg.getType())) -- cgit v1.2.3 From a7372f15c92ca8a556487877cb9df960ad68f4e3 Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Fri, 3 Nov 2017 10:30:19 +0000 Subject: [globalisel][tablegen] Skip src child predicates The GlobalISel TableGen backend didn't check for predicates on the source children. This caused it to generate code for ARM patterns such as SMLABB or similar, but without properly checking for the sext_16_node part of the operands. This in turn meant that we would select SMLABB instead of MLA for simple sequences such as s32 + s32 * s32, which is wrong (we want a MLA on the full operands, not just their bottom 16 bits). This patch forces TableGen to skip patterns with predicates on the src children, so it doesn't generate code for SMLABB and other similar ARM instructions at all anymore. AArch64 and X86 are not affected. 
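To make the failure mode concrete, for a plain 32-bit multiply-accumulate such as the one in the new test below, the correct selection is MLA on the full operands; SMLABB is only valid when both multiply operands are sign-extended 16-bit values, which is exactly what the skipped sext_16_node predicates guard:

    %3(s32) = G_MUL %0, %1
    %4(s32) = G_ADD %2, %3
    ; must select MLA %0, %1, %2, not SMLABB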
Differential Revision: https://reviews.llvm.org/D39554 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317313 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../GlobalISel/arm-instruction-select-combos.mir | 35 ++++++++++++++++++++++ utils/TableGen/GlobalISelEmitter.cpp | 3 ++ 2 files changed, 38 insertions(+) diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir index d96463f00c7..939c851584c 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir @@ -1,6 +1,7 @@ # RUN: llc -O0 -mtriple arm-- -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --- | define void @test_mla() #0 { ret void } + define void @test_mla_commutative() #0 { ret void } define void @test_mla_v5() #1 { ret void } define void @test_mls() #2 { ret void } @@ -45,6 +46,40 @@ body: | ; CHECK: BX_RET 14, _, implicit %r0 ... --- +name: test_mla_commutative +# CHECK-LABEL: name: test_mla_commutative +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } +body: | + bb.0: + liveins: %r0, %r1, %r2 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + ; CHECK: [[VREGX:%[0-9]+]]:gprnopc = COPY %r0 + ; CHECK: [[VREGY:%[0-9]+]]:gprnopc = COPY %r1 + ; CHECK: [[VREGZ:%[0-9]+]]:gprnopc = COPY %r2 + + %3(s32) = G_MUL %0, %1 + %4(s32) = G_ADD %2, %3 + ; CHECK: [[VREGR:%[0-9]+]]:gprnopc = MLA [[VREGX]], [[VREGY]], [[VREGZ]], 14, _, _ + + %r0 = COPY %4(s32) + ; CHECK: %r0 = COPY [[VREGR]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- name: test_mla_v5 # CHECK-LABEL: name: test_mla_v5 legalized: true diff --git a/utils/TableGen/GlobalISelEmitter.cpp b/utils/TableGen/GlobalISelEmitter.cpp index fed8ae5a80b..08649d7f9b5 100644 --- a/utils/TableGen/GlobalISelEmitter.cpp +++ b/utils/TableGen/GlobalISelEmitter.cpp @@ -2629,6 +2629,9 @@ Error GlobalISelEmitter::importChildMatcher(RuleMatcher &Rule, return Error::success(); } + if (SrcChild->hasAnyPredicate()) + return failedImport("Src pattern child has unsupported predicate"); + // Check for constant immediates. if (auto *ChildInt = dyn_cast(SrcChild->getLeafValue())) { OM.addPredicate(ChildInt->getValue()); -- cgit v1.2.3 From 6cd2a99eb6756292d2b78115f00d0fe9e1e35e23 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 3 Nov 2017 11:29:00 +0000 Subject: [PartialInliner] Skip call sites where inlining fails. Summary: InlineFunction can fail, for example when trying to inline vararg fuctions. In those cases, we do not want to bump partial inlining counters or set AnyInlined to true, because this could leave an unused function hanging around. 
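The essential shape of the fixed call-site loop, condensed from the patch, is:

    // Build the remark up front: a successful inline erases the call site.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CS.getCaller());

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    if (!InlineFunction(CS, IFI))
      continue; // e.g. a vararg callee: skip without bumping any counters
    ORE.emit(OR);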
Reviewers: davidxl, davide, gyiu Reviewed By: davide Subscribers: llvm-commits, eraman Differential Revision: https://reviews.llvm.org/D39581 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317314 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/IPO/PartialInlining.cpp | 16 ++++---- .../CodeExtractor/PartialInlineNoInline.ll | 45 ++++++++++++++++++++++ 2 files changed, 54 insertions(+), 7 deletions(-) create mode 100644 test/Transforms/CodeExtractor/PartialInlineNoInline.ll diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index b5267f75e41..c47d8b78df3 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -931,15 +931,17 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE)) continue; - ORE.emit([&]() { - return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", - CS.getInstruction()) - << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into " - << ore::NV("Caller", CS.getCaller()); - }); + // Construct remark before doing the inlining, as after successful inlining + // the callsite is removed. + OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction()); + OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into " + << ore::NV("Caller", CS.getCaller()); InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI); - InlineFunction(CS, IFI); + if (!InlineFunction(CS, IFI)) + continue; + + ORE.emit(OR); // Now update the entry count: if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) { diff --git a/test/Transforms/CodeExtractor/PartialInlineNoInline.ll b/test/Transforms/CodeExtractor/PartialInlineNoInline.ll new file mode 100644 index 00000000000..6c0b83298d2 --- /dev/null +++ b/test/Transforms/CodeExtractor/PartialInlineNoInline.ll @@ -0,0 +1,45 @@ +; RUN: opt < %s -partial-inliner -S -stats -pass-remarks=partial-inlining 2>&1 | FileCheck %s +; RUN: opt < %s -passes=partial-inliner -S -stats -pass-remarks=partial-inlining 2>&1 | FileCheck %s + +@stat = external global i32, align 4 + +define i32 @inline_fail(i32 %count, ...) { +entry: + %vargs = alloca i8*, align 8 + %vargs1 = bitcast i8** %vargs to i8* + call void @llvm.va_start(i8* %vargs1) + %stat1 = load i32, i32* @stat, align 4 + %cmp = icmp slt i32 %stat1, 0 + br i1 %cmp, label %bb2, label %bb1 + +bb1: ; preds = %entry + %vg1 = add nsw i32 %stat1, 1 + store i32 %vg1, i32* @stat, align 4 + %va1 = va_arg i8** %vargs, i32 + call void @foo(i32 %count, i32 %va1) #2 + br label %bb2 + +bb2: ; preds = %bb1, %entry + %res = phi i32 [ 1, %bb1 ], [ 0, %entry ] + call void @llvm.va_end(i8* %vargs1) + ret i32 %res +} + +define i32 @caller(i32 %arg) { +bb: + %res = tail call i32 (i32, ...) @inline_fail(i32 %arg, i32 %arg) + ret i32 %res +} + +declare void @foo(i32, i32) +declare void @llvm.va_start(i8*) +declare void @llvm.va_end(i8*) + +; Check that no remarks have been emitted, inline_fail has not been partial +; inlined, no code has been extracted and the partial-inlining counter +; has not been incremented. + +; CHECK-NOT: remark +; CHECK: tail call i32 (i32, ...) 
@inline_fail(i32 %arg, i32 %arg) +; CHECK-NOT: inline_fail.1_bb1 +; CHECK-NOT: partial-inlining -- cgit v1.2.3 From eb7c044ce99bf0576ab8017c0b63eb0f2d7e6c5b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 3 Nov 2017 11:33:48 +0000 Subject: [X86][SSE] Add PACKUS support to combineVectorTruncation Similar to the existing code to lower to PACKSS, we can use PACKUS if the input vector's leading zero bits extend all the way to the packed/truncated value. We have to account for pre-SSE41 targets not supporting PACKUSDW git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317315 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 22 +++-- test/CodeGen/X86/avg.ll | 185 ++++++++++++++++--------------------- test/CodeGen/X86/combine-srl.ll | 2 +- test/CodeGen/X86/vector-trunc.ll | 73 +++++++++------ 4 files changed, 141 insertions(+), 141 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index d64cc411391..d65a65e365c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -34433,8 +34433,9 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// This function transforms vector truncation of 'extended sign-bits' values. -/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations. +/// This function transforms vector truncation of 'extended sign-bits' or +/// 'extended zero-bits' values. +/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations. static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -34467,10 +34468,19 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL, // packed/truncated value. e.g. Comparison result, sext_in_reg, etc. unsigned NumSignBits = DAG.ComputeNumSignBits(In); unsigned NumPackedBits = std::min(SVT.getSizeInBits(), 16); - if (NumSignBits <= (InSVT.getSizeInBits() - NumPackedBits)) - return SDValue(); + if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits)) + return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget); + + // Use PACKUS if the input has zero-bits that extend all the way to the + // packed/truncated value. e.g. masks, zext_in_reg, etc. + KnownBits Known; + DAG.computeKnownBits(In, Known); + unsigned NumLeadingZeroBits = Known.countMinLeadingZeros(); + NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8; + if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits)) + return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget); - return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget); + return SDValue(); } static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, @@ -34499,7 +34509,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc); } - // Try to truncate extended sign bits with PACKSS. + // Try to truncate extended sign/zero bits with PACKSS/PACKUS. 
if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget)) return V; diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll index 508f10e9889..14494779f10 100644 --- a/test/CodeGen/X86/avg.ll +++ b/test/CodeGen/X86/avg.ll @@ -2209,62 +2209,53 @@ define void @avg_v16i8_const(<16 x i8>* %a) nounwind { define void @avg_v32i8_const(<32 x i8>* %a) nounwind { ; SSE2-LABEL: avg_v32i8_const: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm5 -; SSE2-NEXT: movdqa 16(%rdi), %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [5,6,7,8] -; SSE2-NEXT: paddd %xmm9, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,3,4] -; SSE2-NEXT: paddd %xmm3, %xmm7 -; SSE2-NEXT: paddd %xmm9, %xmm6 -; SSE2-NEXT: paddd %xmm3, %xmm4 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,2,3,4] +; SSE2-NEXT: paddd %xmm9, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,6,7,8] +; SSE2-NEXT: paddd %xmm4, %xmm8 ; SSE2-NEXT: paddd %xmm9, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm8 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm9, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm6 ; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm7 +; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm8 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: packuswb %xmm7, %xmm1 ; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm7 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm3, %xmm7 -; SSE2-NEXT: packuswb %xmm5, %xmm7 -; SSE2-NEXT: pand %xmm3, %xmm6 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: packuswb %xmm6, %xmm4 -; SSE2-NEXT: packuswb %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: packuswb %xmm2, %xmm8 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: packuswb %xmm5, %xmm2 +; SSE2-NEXT: psrld $1, %xmm8 +; SSE2-NEXT: psrld $1, %xmm0 ; SSE2-NEXT: packuswb %xmm8, %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: movdqu %xmm4, (%rax) +; SSE2-NEXT: packuswb %xmm0, %xmm2 +; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i8_const: @@ -2277,9 +2268,9 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind { ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [5,6,7,8] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,2,3,4] ; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm9 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,2,3,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8] ; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5 ; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4 @@ -2287,30 +2278,21 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind { ; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm1 -; AVX1-NEXT: vpsrld 
$1, %xmm1, %xmm8 +; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5 -; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6 -; AVX1-NEXT: vpsrld $1, %xmm9, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm1, %xmm7, %xmm7 -; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm6 -; AVX1-NEXT: vpackuswb %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm5 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1 -; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1 +; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2 +; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $1, %xmm4, %xmm1 +; AVX1-NEXT: vpsrld $1, %xmm5, %xmm2 +; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2 +; AVX1-NEXT: vpsrld $1, %xmm9, %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2567,49 +2549,40 @@ define void @avg_v64i8_const(<64 x i8>* %a) nounwind { ; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6 ; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 ; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm9 +; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 ; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm10 +; AVX2-NEXT: vpsrld $1, %ymm0, %ymm8 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm3 -; AVX2-NEXT: vpsrld $1, %ymm9, %ymm8 +; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 ; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 ; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 ; AVX2-NEXT: vpsrld $1, %ymm6, %ymm6 -; AVX2-NEXT: vpsrld $1, %ymm7, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-NEXT: vpackssdw %xmm7, %xmm2, %xmm7 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm7 -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm0 -; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm0 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; AVX2-NEXT: vpsrld $1, %ymm7, %ymm7 +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX2-NEXT: vpackssdw %xmm0, %xmm7, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-NEXT: vpackssdw %xmm7, %xmm6, %xmm6 +; AVX2-NEXT: vpackuswb %xmm0, %xmm6, %xmm0 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4 -; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; AVX2-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX2-NEXT: vextracti128 
$1, %ymm8, %xmm4 -; AVX2-NEXT: vpackssdw %xmm4, %xmm8, %xmm4 -; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vpackssdw %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm4 -; AVX2-NEXT: vpackssdw %xmm4, %xmm10, %xmm4 -; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm8, %xmm3 +; AVX2-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll index 9f7f8a97dc2..c5f03dbd5a3 100644 --- a/test/CodeGen/X86/combine-srl.ll +++ b/test/CodeGen/X86/combine-srl.ll @@ -175,7 +175,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) { ; SSE: # BB#0: ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: packusdw %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_lshr_trunc_lshr0: diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll index dc08d88074d..ac1083ad447 100644 --- a/test/CodeGen/X86/vector-trunc.ll +++ b/test/CodeGen/X86/vector-trunc.ll @@ -813,13 +813,10 @@ define void @trunc16i32_16i16_lshr(<16 x i32> %a) { ; ; AVX2-LABEL: trunc16i32_16i16_lshr: ; AVX2: # BB#0: # %entry -; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -947,28 +944,52 @@ entry: } define void @trunc16i32_16i8_lshr(<16 x i32> %a) { -; SSE-LABEL: trunc16i32_16i8_lshr: -; SSE: # BB#0: # %entry -; SSE-NEXT: psrld $24, %xmm1 -; SSE-NEXT: psrld $24, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: psrld $24, %xmm3 -; SSE-NEXT: psrld $24, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqu %xmm0, (%rax) -; SSE-NEXT: retq +; SSE2-LABEL: trunc16i32_16i8_lshr: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: psrld $24, %xmm1 +; SSE2-NEXT: psrld $24, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: psrld $24, %xmm3 +; SSE2-NEXT: psrld $24, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc16i32_16i8_lshr: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: psrld $24, %xmm1 +; SSSE3-NEXT: psrld $24, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: psrld 
$24, %xmm3 +; SSSE3-NEXT: psrld $24, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc16i32_16i8_lshr: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: psrld $24, %xmm1 +; SSE41-NEXT: psrld $24, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: psrld $24, %xmm3 +; SSE41-NEXT: psrld $24, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc16i32_16i8_lshr: ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 ; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 ; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -976,16 +997,12 @@ define void @trunc16i32_16i8_lshr(<16 x i32> %a) { ; ; AVX2-LABEL: trunc16i32_16i8_lshr: ; AVX2: # BB#0: # %entry -; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq -- cgit v1.2.3 From 3d456013b6bbf241696e8bf1570502412e62a63c Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Fri, 3 Nov 2017 12:12:27 +0000 Subject: re-land [ExpandMemCmp] Split ExpandMemCmp from CodeGen into its own pass." Fix undefined references: ExpandMemCmp belongs to CodeGen/, not Scalar/. 
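As a rough illustration of what the extracted pass does (the exact expansion depends on the target's supported load sizes and the maximum-loads limit), an equality-against-zero comparison such as

    %call = call i32 @memcmp(i8* %a, i8* %b, i64 8)
    %cmp = icmp eq i32 %call, 0

is expanded on x86-64 into two direct 8-byte loads and a single integer compare; larger sizes are split across a chain of load/compare blocks with an early exit on the first difference.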
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317318 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/Passes.h | 3 + include/llvm/InitializePasses.h | 1 + include/llvm/LinkAllPasses.h | 1 + include/llvm/Transforms/Scalar.h | 2 +- lib/CodeGen/CMakeLists.txt | 1 + lib/CodeGen/CodeGen.cpp | 1 + lib/CodeGen/CodeGenPrepare.cpp | 710 --------------------- lib/CodeGen/ExpandMemCmp.cpp | 828 +++++++++++++++++++++++++ lib/CodeGen/TargetPassConfig.cpp | 10 +- test/CodeGen/Generic/llc-start-stop.ll | 6 +- test/CodeGen/X86/memcmp-optsize.ll | 224 ++++--- test/CodeGen/X86/memcmp.ll | 240 ++++--- test/Transforms/CodeGenPrepare/X86/memcmp.ll | 771 ----------------------- test/Transforms/ExpandMemCmp/X86/lit.local.cfg | 3 + test/Transforms/ExpandMemCmp/X86/memcmp.ll | 792 +++++++++++++++++++++++ tools/opt/opt.cpp | 1 + 16 files changed, 1872 insertions(+), 1722 deletions(-) create mode 100644 lib/CodeGen/ExpandMemCmp.cpp delete mode 100644 test/Transforms/CodeGenPrepare/X86/memcmp.ll create mode 100644 test/Transforms/ExpandMemCmp/X86/lit.local.cfg create mode 100644 test/Transforms/ExpandMemCmp/X86/memcmp.ll diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h index 8e6b1570e4a..c106ff6cdfe 100644 --- a/include/llvm/CodeGen/Passes.h +++ b/include/llvm/CodeGen/Passes.h @@ -417,6 +417,9 @@ namespace llvm { /// shuffles. FunctionPass *createExpandReductionsPass(); + // This pass expands memcmp() to load/stores. + FunctionPass *createExpandMemCmpPass(); + } // End llvm namespace #endif diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 8c63ab0284d..b8183d1c8e2 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -128,6 +128,7 @@ void initializeEdgeBundlesPass(PassRegistry&); void initializeEfficiencySanitizerPass(PassRegistry&); void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&); void initializeExpandISelPseudosPass(PassRegistry&); +void initializeExpandMemCmpPassPass(PassRegistry&); void initializeExpandPostRAPass(PassRegistry&); void initializeExpandReductionsPass(PassRegistry&); void initializeExternalAAWrapperPassPass(PassRegistry&); diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index 765e63926da..ce70f53ccb0 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -180,6 +180,7 @@ namespace { (void) llvm::createReversePostOrderFunctionAttrsPass(); (void) llvm::createMergeFunctionsPass(); (void) llvm::createMergeICmpsPass(); + (void) llvm::createExpandMemCmpPass(); std::string buf; llvm::raw_string_ostream os(buf); (void) llvm::createPrintModulePass(os); diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index 8ef65774a93..a78c897683f 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -422,7 +422,7 @@ Pass *createLowerGuardIntrinsicPass(); //===----------------------------------------------------------------------===// // -// MergeICmps - Merge integer comparison chains +// MergeICmps - Merge integer comparison chains into a memcmp // Pass *createMergeICmpsPass(); diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 2e364cd4794..df04cf85049 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -21,6 +21,7 @@ add_llvm_library(LLVMCodeGen EdgeBundles.cpp ExecutionDepsFix.cpp ExpandISelPseudos.cpp + ExpandMemCmp.cpp ExpandPostRAPseudos.cpp ExpandReductions.cpp FaultMaps.cpp diff --git a/lib/CodeGen/CodeGen.cpp 
b/lib/CodeGen/CodeGen.cpp index bfab865687e..2f119554a1e 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -30,6 +30,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeDwarfEHPreparePass(Registry); initializeEarlyIfConverterPass(Registry); initializeExpandISelPseudosPass(Registry); + initializeExpandMemCmpPassPass(Registry); initializeExpandPostRAPass(Registry); initializeFEntryInserterPass(Registry); initializeFinalizeMachineBundlesPass(Registry); diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 51f2a320b29..973ddebd987 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -123,12 +123,6 @@ STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); -STATISTIC(NumMemCmpCalls, "Number of memcmp calls"); -STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size"); -STATISTIC(NumMemCmpGreaterThanMax, - "Number of memcmp calls with size greater than max size"); -STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls"); - static cl::opt DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), cl::desc("Disable branch optimizations in CodeGenPrepare")); @@ -189,11 +183,6 @@ EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden, cl::desc("Enable merging of redundant sexts when one is dominating" " the other."), cl::init(true)); -static cl::opt MemCmpNumLoadsPerBlock( - "memcmp-num-loads-per-block", cl::Hidden, cl::init(1), - cl::desc("The number of loads per basic block for inline expansion of " - "memcmp that is only being compared against zero.")); - namespace { using SetOfInstrs = SmallPtrSet; @@ -1697,699 +1686,6 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, return true; } -namespace { - -// This class provides helper functions to expand a memcmp library call into an -// inline expansion. -class MemCmpExpansion { - struct ResultBlock { - BasicBlock *BB = nullptr; - PHINode *PhiSrc1 = nullptr; - PHINode *PhiSrc2 = nullptr; - - ResultBlock() = default; - }; - - CallInst *const CI; - ResultBlock ResBlock; - const uint64_t Size; - unsigned MaxLoadSize; - uint64_t NumLoadsNonOneByte; - const uint64_t NumLoadsPerBlock; - std::vector LoadCmpBlocks; - BasicBlock *EndBlock; - PHINode *PhiRes; - const bool IsUsedForZeroCmp; - const DataLayout &DL; - IRBuilder<> Builder; - // Represents the decomposition in blocks of the expansion. For example, - // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and - // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {32, 1}. - // TODO(courbet): Involve the target more in this computation. On X86, 7 - // bytes can be done more efficiently with two overlaping 4-byte loads than - // covering the interval with [{4, 0},{2, 4},{1, 6}}. - struct LoadEntry { - LoadEntry(unsigned LoadSize, uint64_t Offset) - : LoadSize(LoadSize), Offset(Offset) { - assert(Offset % LoadSize == 0 && "invalid load entry"); - } - - uint64_t getGEPIndex() const { return Offset / LoadSize; } - - // The size of the load for this block, in bytes. - const unsigned LoadSize; - // The offset of this load WRT the base pointer, in bytes. 
- const uint64_t Offset; - }; - SmallVector LoadSequence; - - void createLoadCmpBlocks(); - void createResultBlock(); - void setupResultBlockPHINodes(); - void setupEndBlockPHINodes(); - Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex); - void emitLoadCompareBlock(unsigned BlockIndex); - void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, - unsigned &LoadIndex); - void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex); - void emitMemCmpResultBlock(); - Value *getMemCmpExpansionZeroCase(); - Value *getMemCmpEqZeroOneBlock(); - Value *getMemCmpOneBlock(); - - public: - MemCmpExpansion(CallInst *CI, uint64_t Size, - const TargetTransformInfo::MemCmpExpansionOptions &Options, - unsigned MaxNumLoads, const bool IsUsedForZeroCmp, - unsigned NumLoadsPerBlock, const DataLayout &DL); - - unsigned getNumBlocks(); - uint64_t getNumLoads() const { return LoadSequence.size(); } - - Value *getMemCmpExpansion(); -}; - -} // end anonymous namespace - -// Initialize the basic block structure required for expansion of memcmp call -// with given maximum load size and memcmp size parameter. -// This structure includes: -// 1. A list of load compare blocks - LoadCmpBlocks. -// 2. An EndBlock, split from original instruction point, which is the block to -// return from. -// 3. ResultBlock, block to branch to for early exit when a -// LoadCmpBlock finds a difference. -MemCmpExpansion::MemCmpExpansion( - CallInst *const CI, uint64_t Size, - const TargetTransformInfo::MemCmpExpansionOptions &Options, - const unsigned MaxNumLoads, const bool IsUsedForZeroCmp, - const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout) - : CI(CI), - Size(Size), - MaxLoadSize(0), - NumLoadsNonOneByte(0), - NumLoadsPerBlock(NumLoadsPerBlock), - IsUsedForZeroCmp(IsUsedForZeroCmp), - DL(TheDataLayout), - Builder(CI) { - assert(Size > 0 && "zero blocks"); - // Scale the max size down if the target can load more bytes than we need. - size_t LoadSizeIndex = 0; - while (LoadSizeIndex < Options.LoadSizes.size() && - Options.LoadSizes[LoadSizeIndex] > Size) { - ++LoadSizeIndex; - } - this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex]; - // Compute the decomposition. - uint64_t CurSize = Size; - uint64_t Offset = 0; - while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) { - const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex]; - assert(LoadSize > 0 && "zero load size"); - const uint64_t NumLoadsForThisSize = CurSize / LoadSize; - if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) { - // Do not expand if the total number of loads is larger than what the - // target allows. Note that it's important that we exit before completing - // the expansion to avoid using a ton of memory to store the expansion for - // large sizes. - LoadSequence.clear(); - return; - } - if (NumLoadsForThisSize > 0) { - for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) { - LoadSequence.push_back({LoadSize, Offset}); - Offset += LoadSize; - } - if (LoadSize > 1) { - ++NumLoadsNonOneByte; - } - CurSize = CurSize % LoadSize; - } - ++LoadSizeIndex; - } - assert(LoadSequence.size() <= MaxNumLoads && "broken invariant"); -} - -unsigned MemCmpExpansion::getNumBlocks() { - if (IsUsedForZeroCmp) - return getNumLoads() / NumLoadsPerBlock + - (getNumLoads() % NumLoadsPerBlock != 0 ? 
1 : 0); - return getNumLoads(); -} - -void MemCmpExpansion::createLoadCmpBlocks() { - for (unsigned i = 0; i < getNumBlocks(); i++) { - BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb", - EndBlock->getParent(), EndBlock); - LoadCmpBlocks.push_back(BB); - } -} - -void MemCmpExpansion::createResultBlock() { - ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block", - EndBlock->getParent(), EndBlock); -} - -// This function creates the IR instructions for loading and comparing 1 byte. -// It loads 1 byte from each source of the memcmp parameters with the given -// GEPIndex. It then subtracts the two loaded values and adds this result to the -// final phi node for selecting the memcmp result. -void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex, - unsigned GEPIndex) { - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - Type *LoadSizeType = Type::getInt8Ty(CI->getContext()); - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using the GEPIndex. - if (GEPIndex != 0) { - Source1 = Builder.CreateGEP(LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, GEPIndex)); - Source2 = Builder.CreateGEP(LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, GEPIndex)); - } - - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext())); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext())); - Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); - - PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]); - - if (BlockIndex < (LoadCmpBlocks.size() - 1)) { - // Early exit branch if difference found to EndBlock. Otherwise, continue to - // next LoadCmpBlock, - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, - ConstantInt::get(Diff->getType(), 0)); - BranchInst *CmpBr = - BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp); - Builder.Insert(CmpBr); - } else { - // The last block has an unconditional branch to EndBlock. - BranchInst *CmpBr = BranchInst::Create(EndBlock); - Builder.Insert(CmpBr); - } -} - -/// Generate an equality comparison for one or more pairs of loaded values. -/// This is used in the case where the memcmp() call is compared equal or not -/// equal to zero. -Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex, - unsigned &LoadIndex) { - assert(LoadIndex < getNumLoads() && - "getCompareLoadPairs() called with no remaining loads"); - std::vector XorList, OrList; - Value *Diff; - - const unsigned NumLoads = - std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock); - - // For a single-block expansion, start inserting before the memcmp call. - if (LoadCmpBlocks.empty()) - Builder.SetInsertPoint(CI); - else - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - - Value *Cmp = nullptr; - // If we have multiple loads per block, we need to generate a composite - // comparison using xor+or. The type for the combinations is the largest load - // type. - IntegerType *const MaxLoadType = - NumLoads == 1 ? 
nullptr - : IntegerType::get(CI->getContext(), MaxLoadSize * 8); - for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) { - const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex]; - - IntegerType *LoadSizeType = - IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); - - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using a GEP. - if (CurLoadEntry.Offset != 0) { - Source1 = Builder.CreateGEP( - LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - Source2 = Builder.CreateGEP( - LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - } - - // Get a constant or load a value for each source address. - Value *LoadSrc1 = nullptr; - if (auto *Source1C = dyn_cast(Source1)) - LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL); - if (!LoadSrc1) - LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - - Value *LoadSrc2 = nullptr; - if (auto *Source2C = dyn_cast(Source2)) - LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL); - if (!LoadSrc2) - LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (NumLoads != 1) { - if (LoadSizeType != MaxLoadType) { - LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); - } - // If we have multiple loads per block, we need to generate a composite - // comparison using xor+or. - Diff = Builder.CreateXor(LoadSrc1, LoadSrc2); - Diff = Builder.CreateZExt(Diff, MaxLoadType); - XorList.push_back(Diff); - } else { - // If there's only one load per block, we just compare the loaded values. - Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2); - } - } - - auto pairWiseOr = [&](std::vector &InList) -> std::vector { - std::vector OutList; - for (unsigned i = 0; i < InList.size() - 1; i = i + 2) { - Value *Or = Builder.CreateOr(InList[i], InList[i + 1]); - OutList.push_back(Or); - } - if (InList.size() % 2 != 0) - OutList.push_back(InList.back()); - return OutList; - }; - - if (!Cmp) { - // Pairwise OR the XOR results. - OrList = pairWiseOr(XorList); - - // Pairwise OR the OR results until one result left. - while (OrList.size() != 1) { - OrList = pairWiseOr(OrList); - } - Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0)); - } - - return Cmp; -} - -void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, - unsigned &LoadIndex) { - Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex); - - BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1)) - ? EndBlock - : LoadCmpBlocks[BlockIndex + 1]; - // Early exit branch if difference found to ResultBlock. Otherwise, - // continue to next LoadCmpBlock or EndBlock. - BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); - Builder.Insert(CmpBr); - - // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 - // since early exit to ResultBlock was not taken (no difference was found in - // any of the bytes). 
- if (BlockIndex == LoadCmpBlocks.size() - 1) { - Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); - PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); - } -} - -// This function creates the IR intructions for loading and comparing using the -// given LoadSize. It loads the number of bytes specified by LoadSize from each -// source of the memcmp parameters. It then does a subtract to see if there was -// a difference in the loaded values. If a difference is found, it branches -// with an early exit to the ResultBlock for calculating which source was -// larger. Otherwise, it falls through to the either the next LoadCmpBlock or -// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with -// a special case through emitLoadCompareByteBlock. The special handling can -// simply subtract the loaded values and add it to the result phi node. -void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) { - // There is one load per block in this case, BlockIndex == LoadIndex. - const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex]; - - if (CurLoadEntry.LoadSize == 1) { - MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, - CurLoadEntry.getGEPIndex()); - return; - } - - Type *LoadSizeType = - IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); - Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); - assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type"); - - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using a GEP. - if (CurLoadEntry.Offset != 0) { - Source1 = Builder.CreateGEP( - LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - Source2 = Builder.CreateGEP( - LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - } - - // Load LoadSizeType from the base address. - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (DL.isLittleEndian()) { - Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::bswap, LoadSizeType); - LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); - LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); - } - - if (LoadSizeType != MaxLoadType) { - LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); - } - - // Add the loaded values to the phi nodes for calculating memcmp result only - // if result is not used in a zero equality. - if (!IsUsedForZeroCmp) { - ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]); - ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]); - } - - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2); - BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1)) - ? EndBlock - : LoadCmpBlocks[BlockIndex + 1]; - // Early exit branch if difference found to ResultBlock. Otherwise, continue - // to next LoadCmpBlock or EndBlock. 
- BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp); - Builder.Insert(CmpBr); - - // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 - // since early exit to ResultBlock was not taken (no difference was found in - // any of the bytes). - if (BlockIndex == LoadCmpBlocks.size() - 1) { - Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); - PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); - } -} - -// This function populates the ResultBlock with a sequence to calculate the -// memcmp result. It compares the two loaded source values and returns -1 if -// src1 < src2 and 1 if src1 > src2. -void MemCmpExpansion::emitMemCmpResultBlock() { - // Special case: if memcmp result is used in a zero equality, result does not - // need to be calculated and can simply return 1. - if (IsUsedForZeroCmp) { - BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); - Builder.SetInsertPoint(ResBlock.BB, InsertPt); - Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); - PhiRes->addIncoming(Res, ResBlock.BB); - BranchInst *NewBr = BranchInst::Create(EndBlock); - Builder.Insert(NewBr); - return; - } - BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); - Builder.SetInsertPoint(ResBlock.BB, InsertPt); - - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1, - ResBlock.PhiSrc2); - - Value *Res = - Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), - ConstantInt::get(Builder.getInt32Ty(), 1)); - - BranchInst *NewBr = BranchInst::Create(EndBlock); - Builder.Insert(NewBr); - PhiRes->addIncoming(Res, ResBlock.BB); -} - -void MemCmpExpansion::setupResultBlockPHINodes() { - Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); - Builder.SetInsertPoint(ResBlock.BB); - // Note: this assumes one load per block. - ResBlock.PhiSrc1 = - Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1"); - ResBlock.PhiSrc2 = - Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2"); -} - -void MemCmpExpansion::setupEndBlockPHINodes() { - Builder.SetInsertPoint(&EndBlock->front()); - PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); -} - -Value *MemCmpExpansion::getMemCmpExpansionZeroCase() { - unsigned LoadIndex = 0; - // This loop populates each of the LoadCmpBlocks with the IR sequence to - // handle multiple loads per block. - for (unsigned I = 0; I < getNumBlocks(); ++I) { - emitLoadCompareBlockMultipleLoads(I, LoadIndex); - } - - emitMemCmpResultBlock(); - return PhiRes; -} - -/// A memcmp expansion that compares equality with 0 and only has one block of -/// load and compare can bypass the compare, branch, and phi IR that is required -/// in the general case. -Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() { - unsigned LoadIndex = 0; - Value *Cmp = getCompareLoadPairs(0, LoadIndex); - assert(LoadIndex == getNumLoads() && "some entries were not consumed"); - return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); -} - -/// A memcmp expansion that only has one block of load and compare can bypass -/// the compare, branch, and phi IR that is required in the general case. -Value *MemCmpExpansion::getMemCmpOneBlock() { - assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block"); - - Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8); - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - // Cast source to LoadSizeType*. 
- if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Load LoadSizeType from the base address. - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (DL.isLittleEndian() && Size != 1) { - Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::bswap, LoadSizeType); - LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); - LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); - } - - if (Size < 4) { - // The i8 and i16 cases don't need compares. We zext the loaded values and - // subtract them to get the suitable negative, zero, or positive i32 result. - LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty()); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty()); - return Builder.CreateSub(LoadSrc1, LoadSrc2); - } - - // The result of memcmp is negative, zero, or positive, so produce that by - // subtracting 2 extended compare bits: sub (ugt, ult). - // If a target prefers to use selects to get -1/0/1, they should be able - // to transform this later. The inverse transform (going from selects to math) - // may not be possible in the DAG because the selects got converted into - // branches before we got there. - Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2); - Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2); - Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty()); - Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty()); - return Builder.CreateSub(ZextUGT, ZextULT); -} - -// This function expands the memcmp call into an inline expansion and returns -// the memcmp result. -Value *MemCmpExpansion::getMemCmpExpansion() { - // A memcmp with zero-comparison with only one block of load and compare does - // not need to set up any extra blocks. This case could be handled in the DAG, - // but since we have all of the machinery to flexibly expand any memcpy here, - // we choose to handle this case too to avoid fragmented lowering. - if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) { - BasicBlock *StartBlock = CI->getParent(); - EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); - setupEndBlockPHINodes(); - createResultBlock(); - - // If return value of memcmp is not used in a zero equality, we need to - // calculate which source was larger. The calculation requires the - // two loaded source values of each load compare block. - // These will be saved in the phi nodes created by setupResultBlockPHINodes. - if (!IsUsedForZeroCmp) setupResultBlockPHINodes(); - - // Create the number of required load compare basic blocks. - createLoadCmpBlocks(); - - // Update the terminator added by splitBasicBlock to branch to the first - // LoadCmpBlock. - StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); - } - - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - if (IsUsedForZeroCmp) - return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock() - : getMemCmpExpansionZeroCase(); - - // TODO: Handle more than one load pair per block in getMemCmpOneBlock(). - if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock(); - - for (unsigned I = 0; I < getNumBlocks(); ++I) { - emitLoadCompareBlock(I); - } - - emitMemCmpResultBlock(); - return PhiRes; -} - -// This function checks to see if an expansion of memcmp can be generated. 
-// It checks for constant compare size that is less than the max inline size. -// If an expansion cannot occur, returns false to leave as a library call. -// Otherwise, the library call is replaced with a new IR instruction sequence. -/// We want to transform: -/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15) -/// To: -/// loadbb: -/// %0 = bitcast i32* %buffer2 to i8* -/// %1 = bitcast i32* %buffer1 to i8* -/// %2 = bitcast i8* %1 to i64* -/// %3 = bitcast i8* %0 to i64* -/// %4 = load i64, i64* %2 -/// %5 = load i64, i64* %3 -/// %6 = call i64 @llvm.bswap.i64(i64 %4) -/// %7 = call i64 @llvm.bswap.i64(i64 %5) -/// %8 = sub i64 %6, %7 -/// %9 = icmp ne i64 %8, 0 -/// br i1 %9, label %res_block, label %loadbb1 -/// res_block: ; preds = %loadbb2, -/// %loadbb1, %loadbb -/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ] -/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ] -/// %10 = icmp ult i64 %phi.src1, %phi.src2 -/// %11 = select i1 %10, i32 -1, i32 1 -/// br label %endblock -/// loadbb1: ; preds = %loadbb -/// %12 = bitcast i32* %buffer2 to i8* -/// %13 = bitcast i32* %buffer1 to i8* -/// %14 = bitcast i8* %13 to i32* -/// %15 = bitcast i8* %12 to i32* -/// %16 = getelementptr i32, i32* %14, i32 2 -/// %17 = getelementptr i32, i32* %15, i32 2 -/// %18 = load i32, i32* %16 -/// %19 = load i32, i32* %17 -/// %20 = call i32 @llvm.bswap.i32(i32 %18) -/// %21 = call i32 @llvm.bswap.i32(i32 %19) -/// %22 = zext i32 %20 to i64 -/// %23 = zext i32 %21 to i64 -/// %24 = sub i64 %22, %23 -/// %25 = icmp ne i64 %24, 0 -/// br i1 %25, label %res_block, label %loadbb2 -/// loadbb2: ; preds = %loadbb1 -/// %26 = bitcast i32* %buffer2 to i8* -/// %27 = bitcast i32* %buffer1 to i8* -/// %28 = bitcast i8* %27 to i16* -/// %29 = bitcast i8* %26 to i16* -/// %30 = getelementptr i16, i16* %28, i16 6 -/// %31 = getelementptr i16, i16* %29, i16 6 -/// %32 = load i16, i16* %30 -/// %33 = load i16, i16* %31 -/// %34 = call i16 @llvm.bswap.i16(i16 %32) -/// %35 = call i16 @llvm.bswap.i16(i16 %33) -/// %36 = zext i16 %34 to i64 -/// %37 = zext i16 %35 to i64 -/// %38 = sub i64 %36, %37 -/// %39 = icmp ne i64 %38, 0 -/// br i1 %39, label %res_block, label %loadbb3 -/// loadbb3: ; preds = %loadbb2 -/// %40 = bitcast i32* %buffer2 to i8* -/// %41 = bitcast i32* %buffer1 to i8* -/// %42 = getelementptr i8, i8* %41, i8 14 -/// %43 = getelementptr i8, i8* %40, i8 14 -/// %44 = load i8, i8* %42 -/// %45 = load i8, i8* %43 -/// %46 = zext i8 %44 to i32 -/// %47 = zext i8 %45 to i32 -/// %48 = sub i32 %46, %47 -/// br label %endblock -/// endblock: ; preds = %res_block, -/// %loadbb3 -/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ] -/// ret i32 %phi.res -static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, - const TargetLowering *TLI, const DataLayout *DL) { - NumMemCmpCalls++; - - // Early exit from expansion if -Oz. - if (CI->getFunction()->optForMinSize()) - return false; - - // Early exit from expansion if size is not a constant. - ConstantInt *SizeCast = dyn_cast(CI->getArgOperand(2)); - if (!SizeCast) { - NumMemCmpNotConstant++; - return false; - } - const uint64_t SizeVal = SizeCast->getZExtValue(); - - if (SizeVal == 0) { - return false; - } - - // TTI call to check if target would like to expand memcmp. Also, get the - // available load sizes. 
- const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); - const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp); - if (!Options) return false; - - const unsigned MaxNumLoads = - TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize()); - - MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads, - IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL); - - // Don't expand if this will require more loads than desired by the target. - if (Expansion.getNumLoads() == 0) { - NumMemCmpGreaterThanMax++; - return false; - } - - NumMemCmpInlined++; - - Value *Res = Expansion.getMemCmpExpansion(); - - // Replace call with result of expansion and erase call. - CI->replaceAllUsesWith(Res); - CI->eraseFromParent(); - - return true; -} - bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { BasicBlock *BB = CI->getParent(); @@ -2542,12 +1838,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { return true; } - LibFunc Func; - if (TLInfo->getLibFunc(ImmutableCallSite(CI), Func) && - Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TLI, DL)) { - ModifiedDT = true; - return true; - } return false; } diff --git a/lib/CodeGen/ExpandMemCmp.cpp b/lib/CodeGen/ExpandMemCmp.cpp new file mode 100644 index 00000000000..c5910c18d89 --- /dev/null +++ b/lib/CodeGen/ExpandMemCmp.cpp @@ -0,0 +1,828 @@ +//===--- ExpandMemCmp.cpp - Expand memcmp() to load/stores ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass tries to expand memcmp() calls into optimally-sized loads and +// compares for the target, replacing the library call with an inline +// sequence when the size is a small compile-time constant. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "expandmemcmp" + +STATISTIC(NumMemCmpCalls, "Number of memcmp calls"); +STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size"); +STATISTIC(NumMemCmpGreaterThanMax, + "Number of memcmp calls with size greater than max size"); +STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls"); + +static cl::opt<unsigned> MemCmpNumLoadsPerBlock( + "memcmp-num-loads-per-block", cl::Hidden, cl::init(1), + cl::desc("The number of loads per basic block for inline expansion of " + "memcmp that is only being compared against zero.")); + +namespace { + + +// This class provides helper functions to expand a memcmp library call into an +// inline expansion.
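+// The expansion decomposes the compared region into a target-provided +// sequence of load sizes and emits one compare block per load (or per +// group of loads when the result is only compared against zero).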
+class MemCmpExpansion { + struct ResultBlock { + BasicBlock *BB = nullptr; + PHINode *PhiSrc1 = nullptr; + PHINode *PhiSrc2 = nullptr; + + ResultBlock() = default; + }; + + CallInst *const CI; + ResultBlock ResBlock; + const uint64_t Size; + unsigned MaxLoadSize; + uint64_t NumLoadsNonOneByte; + const uint64_t NumLoadsPerBlock; + std::vector<BasicBlock *> LoadCmpBlocks; + BasicBlock *EndBlock; + PHINode *PhiRes; + const bool IsUsedForZeroCmp; + const DataLayout &DL; + IRBuilder<> Builder; + // Represents the decomposition in blocks of the expansion. For example, + // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and + // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}]. + // TODO(courbet): Involve the target more in this computation. On X86, 7 + // bytes can be done more efficiently with two overlapping 4-byte loads than + // covering the interval with [{4, 0},{2, 4},{1, 6}]. + struct LoadEntry { + LoadEntry(unsigned LoadSize, uint64_t Offset) + : LoadSize(LoadSize), Offset(Offset) { + assert(Offset % LoadSize == 0 && "invalid load entry"); + } + + uint64_t getGEPIndex() const { return Offset / LoadSize; } + + // The size of the load for this block, in bytes. + const unsigned LoadSize; + // The offset of this load WRT the base pointer, in bytes. + const uint64_t Offset; + }; + SmallVector<LoadEntry, 8> LoadSequence; + + void createLoadCmpBlocks(); + void createResultBlock(); + void setupResultBlockPHINodes(); + void setupEndBlockPHINodes(); + Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex); + void emitLoadCompareBlock(unsigned BlockIndex); + void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, + unsigned &LoadIndex); + void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex); + void emitMemCmpResultBlock(); + Value *getMemCmpExpansionZeroCase(); + Value *getMemCmpEqZeroOneBlock(); + Value *getMemCmpOneBlock(); + + public: + MemCmpExpansion(CallInst *CI, uint64_t Size, + const TargetTransformInfo::MemCmpExpansionOptions &Options, + unsigned MaxNumLoads, const bool IsUsedForZeroCmp, + unsigned NumLoadsPerBlock, const DataLayout &DL); + + unsigned getNumBlocks(); + uint64_t getNumLoads() const { return LoadSequence.size(); } + + Value *getMemCmpExpansion(); +}; + +// Initialize the basic block structure required for expansion of memcmp call +// with given maximum load size and memcmp size parameter. +// This structure includes: +// 1. A list of load compare blocks - LoadCmpBlocks. +// 2. An EndBlock, split from original instruction point, which is the block to +// return from. +// 3. ResultBlock, block to branch to for early exit when a +// LoadCmpBlock finds a difference. +MemCmpExpansion::MemCmpExpansion( + CallInst *const CI, uint64_t Size, + const TargetTransformInfo::MemCmpExpansionOptions &Options, + const unsigned MaxNumLoads, const bool IsUsedForZeroCmp, + const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout) + : CI(CI), + Size(Size), + MaxLoadSize(0), + NumLoadsNonOneByte(0), + NumLoadsPerBlock(NumLoadsPerBlock), + IsUsedForZeroCmp(IsUsedForZeroCmp), + DL(TheDataLayout), + Builder(CI) { + assert(Size > 0 && "zero blocks"); + // Scale the max size down if the target can load more bytes than we need. + size_t LoadSizeIndex = 0; + while (LoadSizeIndex < Options.LoadSizes.size() && + Options.LoadSizes[LoadSizeIndex] > Size) { + ++LoadSizeIndex; + } + this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex]; + // Compute the decomposition.
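+ // Greedily cover the Size bytes with the largest allowed load size first, + // then with progressively smaller sizes for any remainder.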
+ uint64_t CurSize = Size; + uint64_t Offset = 0; + while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) { + const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex]; + assert(LoadSize > 0 && "zero load size"); + const uint64_t NumLoadsForThisSize = CurSize / LoadSize; + if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) { + // Do not expand if the total number of loads is larger than what the + // target allows. Note that it's important that we exit before completing + // the expansion to avoid using a ton of memory to store the expansion for + // large sizes. + LoadSequence.clear(); + return; + } + if (NumLoadsForThisSize > 0) { + for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) { + LoadSequence.push_back({LoadSize, Offset}); + Offset += LoadSize; + } + if (LoadSize > 1) { + ++NumLoadsNonOneByte; + } + CurSize = CurSize % LoadSize; + } + ++LoadSizeIndex; + } + assert(LoadSequence.size() <= MaxNumLoads && "broken invariant"); +} + +unsigned MemCmpExpansion::getNumBlocks() { + if (IsUsedForZeroCmp) + return getNumLoads() / NumLoadsPerBlock + + (getNumLoads() % NumLoadsPerBlock != 0 ? 1 : 0); + return getNumLoads(); +} + +void MemCmpExpansion::createLoadCmpBlocks() { + for (unsigned i = 0; i < getNumBlocks(); i++) { + BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb", + EndBlock->getParent(), EndBlock); + LoadCmpBlocks.push_back(BB); + } +} + +void MemCmpExpansion::createResultBlock() { + ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block", + EndBlock->getParent(), EndBlock); +} + +// This function creates the IR instructions for loading and comparing 1 byte. +// It loads 1 byte from each source of the memcmp parameters with the given +// GEPIndex. It then subtracts the two loaded values and adds this result to the +// final phi node for selecting the memcmp result. +void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex, + unsigned GEPIndex) { + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); + Type *LoadSizeType = Type::getInt8Ty(CI->getContext()); + // Cast source to LoadSizeType*. + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex. + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } + + Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext())); + LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext())); + Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); + + PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]); + + if (BlockIndex < (LoadCmpBlocks.size() - 1)) { + // Early exit branch if difference found to EndBlock. Otherwise, continue to + // next LoadCmpBlock. + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, + ConstantInt::get(Diff->getType(), 0)); + BranchInst *CmpBr = + BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp); + Builder.Insert(CmpBr); + } else { + // The last block has an unconditional branch to EndBlock.
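+ // PhiRes already holds this block's byte difference as an incoming value, + // so no extra compare is needed before the branch.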
+ BranchInst *CmpBr = BranchInst::Create(EndBlock); + Builder.Insert(CmpBr); + } +} + +/// Generate an equality comparison for one or more pairs of loaded values. +/// This is used in the case where the memcmp() call is compared equal or not +/// equal to zero. +Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex, + unsigned &LoadIndex) { + assert(LoadIndex < getNumLoads() && + "getCompareLoadPairs() called with no remaining loads"); + std::vector<Value *> XorList, OrList; + Value *Diff; + + const unsigned NumLoads = + std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock); + + // For a single-block expansion, start inserting before the memcmp call. + if (LoadCmpBlocks.empty()) + Builder.SetInsertPoint(CI); + else + Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); + + Value *Cmp = nullptr; + // If we have multiple loads per block, we need to generate a composite + // comparison using xor+or. The type for the combinations is the largest load + // type. + IntegerType *const MaxLoadType = + NumLoads == 1 ? nullptr + : IntegerType::get(CI->getContext(), MaxLoadSize * 8); + for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) { + const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex]; + + IntegerType *LoadSizeType = + IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + // Cast source to LoadSizeType*. + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using a GEP. + if (CurLoadEntry.Offset != 0) { + Source1 = Builder.CreateGEP( + LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); + Source2 = Builder.CreateGEP( + LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); + } + + // Get a constant or load a value for each source address. + Value *LoadSrc1 = nullptr; + if (auto *Source1C = dyn_cast<Constant>(Source1)) + LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL); + if (!LoadSrc1) + LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + + Value *LoadSrc2 = nullptr; + if (auto *Source2C = dyn_cast<Constant>(Source2)) + LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL); + if (!LoadSrc2) + LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + if (NumLoads != 1) { + if (LoadSizeType != MaxLoadType) { + LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); + LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); + } + // If we have multiple loads per block, we need to generate a composite + // comparison using xor+or. + Diff = Builder.CreateXor(LoadSrc1, LoadSrc2); + Diff = Builder.CreateZExt(Diff, MaxLoadType); + XorList.push_back(Diff); + } else { + // If there's only one load per block, we just compare the loaded values. + Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2); + } + } + + auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> { + std::vector<Value *> OutList; + for (unsigned i = 0; i < InList.size() - 1; i = i + 2) { + Value *Or = Builder.CreateOr(InList[i], InList[i + 1]); + OutList.push_back(Or); + } + if (InList.size() % 2 != 0) + OutList.push_back(InList.back()); + return OutList; + }; + + if (!Cmp) { + // Pairwise OR the XOR results. + OrList = pairWiseOr(XorList); + + // Pairwise OR the OR results until one result left.
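+ // This forms a reduction tree over the per-load XOR differences; the final + // value is nonzero iff any loaded pair differed.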
+ while (OrList.size() != 1) { + OrList = pairWiseOr(OrList); + } + Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0)); + } + + return Cmp; +} + +void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, + unsigned &LoadIndex) { + Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex); + + BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1)) + ? EndBlock + : LoadCmpBlocks[BlockIndex + 1]; + // Early exit branch if difference found to ResultBlock. Otherwise, + // continue to next LoadCmpBlock or EndBlock. + BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); + Builder.Insert(CmpBr); + + // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 + // since early exit to ResultBlock was not taken (no difference was found in + // any of the bytes). + if (BlockIndex == LoadCmpBlocks.size() - 1) { + Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); + PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); + } +} + +// This function creates the IR instructions for loading and comparing using the +// given LoadSize. It loads the number of bytes specified by LoadSize from each +// source of the memcmp parameters. It then does a subtract to see if there was +// a difference in the loaded values. If a difference is found, it branches +// with an early exit to the ResultBlock for calculating which source was +// larger. Otherwise, it falls through to either the next LoadCmpBlock or +// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with +// a special case through emitLoadCompareByteBlock. The special handling can +// simply subtract the loaded values and add the result to the result phi node. +void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) { + // There is one load per block in this case, BlockIndex == LoadIndex. + const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex]; + + if (CurLoadEntry.LoadSize == 1) { + MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, + CurLoadEntry.getGEPIndex()); + return; + } + + Type *LoadSizeType = + IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type"); + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); + // Cast source to LoadSizeType*. + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using a GEP. + if (CurLoadEntry.Offset != 0) { + Source1 = Builder.CreateGEP( + LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); + Source2 = Builder.CreateGEP( + LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); + } + + // Load LoadSizeType from the base address.
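+ // Each source contributes one load of CurLoadEntry.LoadSize bytes; the + // entry's offset was already applied by the GEPs above.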
+ Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + if (DL.isLittleEndian()) { + Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), + Intrinsic::bswap, LoadSizeType); + LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); + LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); + } + + if (LoadSizeType != MaxLoadType) { + LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); + LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); + } + + // Add the loaded values to the phi nodes for calculating memcmp result only + // if result is not used in a zero equality. + if (!IsUsedForZeroCmp) { + ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]); + ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]); + } + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2); + BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1)) + ? EndBlock + : LoadCmpBlocks[BlockIndex + 1]; + // Early exit branch if difference found to ResultBlock. Otherwise, continue + // to next LoadCmpBlock or EndBlock. + BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp); + Builder.Insert(CmpBr); + + // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 + // since early exit to ResultBlock was not taken (no difference was found in + // any of the bytes). + if (BlockIndex == LoadCmpBlocks.size() - 1) { + Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); + PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); + } +} + +// This function populates the ResultBlock with a sequence to calculate the +// memcmp result. It compares the two loaded source values and returns -1 if +// src1 < src2 and 1 if src1 > src2. +void MemCmpExpansion::emitMemCmpResultBlock() { + // Special case: if memcmp result is used in a zero equality, result does not + // need to be calculated and can simply return 1. + if (IsUsedForZeroCmp) { + BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.BB, InsertPt); + Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); + PhiRes->addIncoming(Res, ResBlock.BB); + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + return; + } + BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.BB, InsertPt); + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1, + ResBlock.PhiSrc2); + + Value *Res = + Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), + ConstantInt::get(Builder.getInt32Ty(), 1)); + + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + PhiRes->addIncoming(Res, ResBlock.BB); +} + +void MemCmpExpansion::setupResultBlockPHINodes() { + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + Builder.SetInsertPoint(ResBlock.BB); + // Note: this assumes one load per block. + ResBlock.PhiSrc1 = + Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1"); + ResBlock.PhiSrc2 = + Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2"); +} + +void MemCmpExpansion::setupEndBlockPHINodes() { + Builder.SetInsertPoint(&EndBlock->front()); + PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); +} + +Value *MemCmpExpansion::getMemCmpExpansionZeroCase() { + unsigned LoadIndex = 0; + // This loop populates each of the LoadCmpBlocks with the IR sequence to + // handle multiple loads per block. 
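+ // LoadIndex is carried across iterations so that each block consumes its + // share of the precomputed load sequence.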
+ for (unsigned I = 0; I < getNumBlocks(); ++I) { + emitLoadCompareBlockMultipleLoads(I, LoadIndex); + } + + emitMemCmpResultBlock(); + return PhiRes; +} + +/// A memcmp expansion that compares equality with 0 and only has one block of +/// load and compare can bypass the compare, branch, and phi IR that is required +/// in the general case. +Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() { + unsigned LoadIndex = 0; + Value *Cmp = getCompareLoadPairs(0, LoadIndex); + assert(LoadIndex == getNumLoads() && "some entries were not consumed"); + return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); +} + +/// A memcmp expansion that only has one block of load and compare can bypass +/// the compare, branch, and phi IR that is required in the general case. +Value *MemCmpExpansion::getMemCmpOneBlock() { + assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block"); + + Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8); + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + // Cast source to LoadSizeType*. + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Load LoadSizeType from the base address. + Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + if (DL.isLittleEndian() && Size != 1) { + Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), + Intrinsic::bswap, LoadSizeType); + LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); + LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); + } + + if (Size < 4) { + // The i8 and i16 cases don't need compares. We zext the loaded values and + // subtract them to get the suitable negative, zero, or positive i32 result. + LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty()); + LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty()); + return Builder.CreateSub(LoadSrc1, LoadSrc2); + } + + // The result of memcmp is negative, zero, or positive, so produce that by + // subtracting 2 extended compare bits: sub (ugt, ult). + // If a target prefers to use selects to get -1/0/1, they should be able + // to transform this later. The inverse transform (going from selects to math) + // may not be possible in the DAG because the selects got converted into + // branches before we got there. + Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2); + Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2); + Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty()); + Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty()); + return Builder.CreateSub(ZextUGT, ZextULT); +} + +// This function expands the memcmp call into an inline expansion and returns +// the memcmp result. +Value *MemCmpExpansion::getMemCmpExpansion() { + // A memcmp with zero-comparison with only one block of load and compare does + // not need to set up any extra blocks. This case could be handled in the DAG, + // but since we have all of the machinery to flexibly expand any memcmp here, + // we choose to handle this case too to avoid fragmented lowering.
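+ // Multi-block expansions, by contrast, need the result/end blocks and the + // branch structure set up below.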
+ if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) { + BasicBlock *StartBlock = CI->getParent(); + EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); + setupEndBlockPHINodes(); + createResultBlock(); + + // If return value of memcmp is not used in a zero equality, we need to + // calculate which source was larger. The calculation requires the + // two loaded source values of each load compare block. + // These will be saved in the phi nodes created by setupResultBlockPHINodes. + if (!IsUsedForZeroCmp) setupResultBlockPHINodes(); + + // Create the number of required load compare basic blocks. + createLoadCmpBlocks(); + + // Update the terminator added by splitBasicBlock to branch to the first + // LoadCmpBlock. + StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); + } + + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + if (IsUsedForZeroCmp) + return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock() + : getMemCmpExpansionZeroCase(); + + // TODO: Handle more than one load pair per block in getMemCmpOneBlock(). + if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock(); + + for (unsigned I = 0; I < getNumBlocks(); ++I) { + emitLoadCompareBlock(I); + } + + emitMemCmpResultBlock(); + return PhiRes; +} + +// This function checks to see if an expansion of memcmp can be generated. +// It checks for constant compare size that is less than the max inline size. +// If an expansion cannot occur, returns false to leave as a library call. +// Otherwise, the library call is replaced with a new IR instruction sequence. +/// We want to transform: +/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15) +/// To: +/// loadbb: +/// %0 = bitcast i32* %buffer2 to i8* +/// %1 = bitcast i32* %buffer1 to i8* +/// %2 = bitcast i8* %1 to i64* +/// %3 = bitcast i8* %0 to i64* +/// %4 = load i64, i64* %2 +/// %5 = load i64, i64* %3 +/// %6 = call i64 @llvm.bswap.i64(i64 %4) +/// %7 = call i64 @llvm.bswap.i64(i64 %5) +/// %8 = sub i64 %6, %7 +/// %9 = icmp ne i64 %8, 0 +/// br i1 %9, label %res_block, label %loadbb1 +/// res_block: ; preds = %loadbb2, +/// %loadbb1, %loadbb +/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ] +/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ] +/// %10 = icmp ult i64 %phi.src1, %phi.src2 +/// %11 = select i1 %10, i32 -1, i32 1 +/// br label %endblock +/// loadbb1: ; preds = %loadbb +/// %12 = bitcast i32* %buffer2 to i8* +/// %13 = bitcast i32* %buffer1 to i8* +/// %14 = bitcast i8* %13 to i32* +/// %15 = bitcast i8* %12 to i32* +/// %16 = getelementptr i32, i32* %14, i32 2 +/// %17 = getelementptr i32, i32* %15, i32 2 +/// %18 = load i32, i32* %16 +/// %19 = load i32, i32* %17 +/// %20 = call i32 @llvm.bswap.i32(i32 %18) +/// %21 = call i32 @llvm.bswap.i32(i32 %19) +/// %22 = zext i32 %20 to i64 +/// %23 = zext i32 %21 to i64 +/// %24 = sub i64 %22, %23 +/// %25 = icmp ne i64 %24, 0 +/// br i1 %25, label %res_block, label %loadbb2 +/// loadbb2: ; preds = %loadbb1 +/// %26 = bitcast i32* %buffer2 to i8* +/// %27 = bitcast i32* %buffer1 to i8* +/// %28 = bitcast i8* %27 to i16* +/// %29 = bitcast i8* %26 to i16* +/// %30 = getelementptr i16, i16* %28, i16 6 +/// %31 = getelementptr i16, i16* %29, i16 6 +/// %32 = load i16, i16* %30 +/// %33 = load i16, i16* %31 +/// %34 = call i16 @llvm.bswap.i16(i16 %32) +/// %35 = call i16 @llvm.bswap.i16(i16 %33) +/// %36 = zext i16 %34 to i64 +/// %37 = zext i16 %35 to i64 +/// %38 = sub i64 %36, %37 +/// %39 = icmp ne i64 %38, 
0 +/// br i1 %39, label %res_block, label %loadbb3 +/// loadbb3: ; preds = %loadbb2 +/// %40 = bitcast i32* %buffer2 to i8* +/// %41 = bitcast i32* %buffer1 to i8* +/// %42 = getelementptr i8, i8* %41, i8 14 +/// %43 = getelementptr i8, i8* %40, i8 14 +/// %44 = load i8, i8* %42 +/// %45 = load i8, i8* %43 +/// %46 = zext i8 %44 to i32 +/// %47 = zext i8 %45 to i32 +/// %48 = sub i32 %46, %47 +/// br label %endblock +/// endblock: ; preds = %res_block, +/// %loadbb3 +/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ] +/// ret i32 %phi.res +static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, + const TargetLowering *TLI, const DataLayout *DL) { + NumMemCmpCalls++; + + // Early exit from expansion if -Oz. + if (CI->getFunction()->optForMinSize()) + return false; + + // Early exit from expansion if size is not a constant. + ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + if (!SizeCast) { + NumMemCmpNotConstant++; + return false; + } + const uint64_t SizeVal = SizeCast->getZExtValue(); + + if (SizeVal == 0) { + return false; + } + + // TTI call to check if target would like to expand memcmp. Also, get the + // available load sizes. + const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); + const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp); + if (!Options) return false; + + const unsigned MaxNumLoads = + TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize()); + + MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads, + IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL); + + // Don't expand if this will require more loads than desired by the target. + if (Expansion.getNumLoads() == 0) { + NumMemCmpGreaterThanMax++; + return false; + } + + NumMemCmpInlined++; + + Value *Res = Expansion.getMemCmpExpansion(); + + // Replace call with result of expansion and erase call. + CI->replaceAllUsesWith(Res); + CI->eraseFromParent(); + + return true; +} + + + +class ExpandMemCmpPass : public FunctionPass { +public: + static char ID; + + ExpandMemCmpPass() : FunctionPass(ID) { + initializeExpandMemCmpPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) return false; + + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) { + return false; + } + const TargetLowering* TL = + TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering(); + + const TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + const TargetTransformInfo *TTI = + &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto PA = runImpl(F, TLI, TTI, TL); + return !PA.areAllPreserved(); + } + +private: + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } + + PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, + const TargetLowering* TL); + // Returns true if a change was made.
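+ // Expanding a call splits its containing block, so runImpl restarts its + // scan of the function after any change.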
+ bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, const TargetLowering* TL, + const DataLayout& DL); +}; + +bool ExpandMemCmpPass::runOnBlock( + BasicBlock &BB, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, const TargetLowering* TL, + const DataLayout& DL) { + for (Instruction& I : BB) { + CallInst *CI = dyn_cast<CallInst>(&I); + if (!CI) { + continue; + } + LibFunc Func; + if (TLI->getLibFunc(ImmutableCallSite(CI), Func) && + Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TL, &DL)) { + return true; + } + } + return false; +} + + +PreservedAnalyses ExpandMemCmpPass::runImpl( + Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, + const TargetLowering* TL) { + const DataLayout& DL = F.getParent()->getDataLayout(); + bool MadeChanges = false; + for (auto BBIt = F.begin(); BBIt != F.end();) { + if (runOnBlock(*BBIt, TLI, TTI, TL, DL)) { + MadeChanges = true; + // If changes were made, restart the function from the beginning, since + // the structure of the function was changed. + BBIt = F.begin(); + } else { + ++BBIt; + } + } + return MadeChanges ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} + +} // namespace + +char ExpandMemCmpPass::ID = 0; +INITIALIZE_PASS_BEGIN(ExpandMemCmpPass, "expandmemcmp", + "Expand memcmp() to load/stores", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp", + "Expand memcmp() to load/stores", false, false) + +FunctionPass *llvm::createExpandMemCmpPass() { + return new ExpandMemCmpPass(); +} diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp index c5101b1ecfc..59e88ba3bda 100644 --- a/lib/CodeGen/TargetPassConfig.cpp +++ b/lib/CodeGen/TargetPassConfig.cpp @@ -600,8 +600,14 @@ void TargetPassConfig::addIRPasses() { addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n")); } - if (getOptLevel() != CodeGenOpt::None && EnableMergeICmps) { - addPass(createMergeICmpsPass()); + if (getOptLevel() != CodeGenOpt::None) { + // The MergeICmpsPass tries to create memcmp calls by grouping sequences of + // loads and compares. ExpandMemCmpPass then tries to expand those calls + // into optimally-sized loads and compares. The transforms are enabled by a + // target lowering hook.
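+ // ExpandMemCmp is added unconditionally here, while MergeICmps remains + // guarded by its own flag.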
+ if (EnableMergeICmps) + addPass(createMergeICmpsPass()); + addPass(createExpandMemCmpPass()); } // Run GC lowering passes for builtin collectors diff --git a/test/CodeGen/Generic/llc-start-stop.ll b/test/CodeGen/Generic/llc-start-stop.ll index 85b69c37aa0..9056e2cab49 100644 --- a/test/CodeGen/Generic/llc-start-stop.ll +++ b/test/CodeGen/Generic/llc-start-stop.ll @@ -13,15 +13,15 @@ ; STOP-BEFORE-NOT: Loop Strength Reduction ; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-AFTER -; START-AFTER: -machine-branch-prob -gc-lowering +; START-AFTER: -machine-branch-prob -expandmemcmp ; START-AFTER: FunctionPass Manager -; START-AFTER-NEXT: Lower Garbage Collection Instructions +; START-AFTER-NEXT: Expand memcmp() to load/stores ; RUN: llc < %s -debug-pass=Structure -start-before=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-BEFORE ; START-BEFORE: -machine-branch-prob -domtree ; START-BEFORE: FunctionPass Manager ; START-BEFORE: Loop Strength Reduction -; START-BEFORE-NEXT: Lower Garbage Collection Instructions +; START-BEFORE-NEXT: Expand memcmp() to load/stores ; RUN: not llc < %s -start-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-START-BEFORE ; RUN: not llc < %s -stop-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-STOP-BEFORE diff --git a/test/CodeGen/X86/memcmp-optsize.ll b/test/CodeGen/X86/memcmp-optsize.ll index 77d9fa69182..3f5eeba7055 100644 --- a/test/CodeGen/X86/memcmp-optsize.ll +++ b/test/CodeGen/X86/memcmp-optsize.ll @@ -156,36 +156,36 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize { define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length3_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: cmpw (%ecx), %dx -; X86-NEXT: jne .LBB5_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 2(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: cmpw (%eax), %dx +; X86-NEXT: jne .LBB5_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 2(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 2(%eax), %dl ; X86-NEXT: je .LBB5_3 -; X86-NEXT: .LBB5_1: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: incl %eax +; X86-NEXT: .LBB5_2: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: .LBB5_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: cmpw (%rsi), %ax -; X64-NEXT: jne .LBB5_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB5_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 2(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 2(%rsi), %cl ; X64-NEXT: je .LBB5_3 -; X64-NEXT: .LBB5_1: # %res_block +; X64-NEXT: .LBB5_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB5_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -314,36 +314,36 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize { define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length5_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne 
.LBB10_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 4(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB10_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 4(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 4(%eax), %dl ; X86-NEXT: je .LBB10_3 -; X86-NEXT: .LBB10_1: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: incl %eax +; X86-NEXT: .LBB10_2: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: .LBB10_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: cmpl (%rsi), %eax -; X64-NEXT: jne .LBB10_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB10_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 4(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 4(%rsi), %cl ; X64-NEXT: je .LBB10_3 -; X64-NEXT: .LBB10_1: # %res_block +; X64-NEXT: .LBB10_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB10_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -356,7 +356,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -365,8 +365,8 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB11_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB11_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: movl 4(%esi), %ecx ; X86-NEXT: movl 4(%eax), %edx ; X86-NEXT: bswapl %ecx @@ -374,7 +374,7 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: je .LBB11_3 -; X86-NEXT: .LBB11_1: # %res_block +; X86-NEXT: .LBB11_2: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al @@ -400,22 +400,22 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne .LBB12_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB12_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 4(%eax), %edx ; X86-NEXT: je .LBB12_3 -; X86-NEXT: .LBB12_1: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: incl %eax +; X86-NEXT: .LBB12_2: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: .LBB12_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -432,15 +432,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { define i1 @length8_eq_const(i8* %X) nounwind optsize { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl $858927408, 
(%ecx) # imm = 0x33323130 -; X86-NEXT: jne .LBB13_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB13_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 ; X86-NEXT: je .LBB13_3 -; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: .LBB13_2: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: incl %eax ; X86-NEXT: .LBB13_3: # %endblock @@ -473,16 +473,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB14_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB14_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl 8(%rsi), %ecx ; X64-NEXT: je .LBB14_3 -; X64-NEXT: .LBB14_1: # %res_block +; X64-NEXT: .LBB14_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB14_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -505,28 +505,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB15_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB15_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB15_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB15_1: # %res_block +; X64-NEXT: je .LBB15_3 +; X64-NEXT: .LBB15_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB15_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -546,28 +545,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB16_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB16_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB16_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: je .LBB16_3 +; X64-NEXT: .LBB16_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m @@ -701,19 +699,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB20_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB20_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movq 
16(%rdi), %rcx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx ; X64-SSE2-NEXT: je .LBB20_3 -; X64-SSE2-NEXT: .LBB20_1: # %res_block +; X64-SSE2-NEXT: .LBB20_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB20_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -721,18 +719,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length24_eq: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: jne .LBB20_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB20_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: movq 16(%rdi), %rcx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx ; X64-AVX2-NEXT: je .LBB20_3 -; X64-AVX2-NEXT: .LBB20_1: # %res_block +; X64-AVX2-NEXT: .LBB20_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB20_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -757,18 +755,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB21_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB21_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) ; X64-SSE2-NEXT: je .LBB21_3 -; X64-SSE2-NEXT: .LBB21_1: # %res_block +; X64-SSE2-NEXT: .LBB21_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB21_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -776,18 +774,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length24_eq_const: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: jne .LBB21_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB21_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-AVX2-NEXT: cmpq %rcx, 16(%rdi) ; X64-AVX2-NEXT: je .LBB21_3 -; X64-AVX2-NEXT: .LBB21_1: # %res_block +; X64-AVX2-NEXT: .LBB21_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB21_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -833,7 +831,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-NOSSE-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 @@ -841,8 +839,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X86-SSE2-NEXT: pmovmskb %xmm1, %edx ; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB23_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB23_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 ; 
X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -850,7 +848,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB23_3 -; X86-SSE2-NEXT: .LBB23_1: # %res_block +; X86-SSE2-NEXT: .LBB23_2: # %res_block ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: incl %eax ; X86-SSE2-NEXT: .LBB23_3: # %endblock @@ -859,14 +857,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB23_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB23_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -874,7 +872,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB23_3 -; X64-SSE2-NEXT: .LBB23_1: # %res_block +; X64-SSE2-NEXT: .LBB23_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB23_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -909,21 +907,21 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize { ; X86-NOSSE-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movdqu (%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB24_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB24_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB24_3 -; X86-SSE2-NEXT: .LBB24_1: # %res_block +; X86-SSE2-NEXT: .LBB24_2: # %res_block ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: incl %eax ; X86-SSE2-NEXT: .LBB24_3: # %endblock @@ -932,20 +930,20 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB24_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB24_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB24_3 -; X64-SSE2-NEXT: .LBB24_1: # %res_block +; X64-SSE2-NEXT: .LBB24_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB24_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -1009,20 +1007,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: 
vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB26_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB26_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB26_3 -; X64-AVX2-NEXT: .LBB26_1: # %res_block +; X64-AVX2-NEXT: .LBB26_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB26_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -1059,20 +1057,20 @@ define i1 @length64_eq_const(i8* %X) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq_const: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB27_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB27_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB27_3 -; X64-AVX2-NEXT: .LBB27_1: # %res_block +; X64-AVX2-NEXT: .LBB27_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB27_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll index 393e4c42d8b..84fd45b0a08 100644 --- a/test/CodeGen/X86/memcmp.ll +++ b/test/CodeGen/X86/memcmp.ll @@ -187,35 +187,35 @@ define i32 @length3(i8* %X, i8* %Y) nounwind { define i1 @length3_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length3_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: cmpw (%ecx), %dx -; X86-NEXT: jne .LBB7_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 2(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: cmpw (%eax), %dx +; X86-NEXT: jne .LBB7_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 2(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 2(%eax), %dl ; X86-NEXT: je .LBB7_3 -; X86-NEXT: .LBB7_1: # %res_block -; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB7_2: # %res_block +; X86-NEXT: movl $1, %ecx ; X86-NEXT: .LBB7_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: cmpw (%rsi), %ax -; X64-NEXT: jne .LBB7_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB7_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 2(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 2(%rsi), %cl ; X64-NEXT: je .LBB7_3 -; X64-NEXT: .LBB7_1: # %res_block +; X64-NEXT: .LBB7_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB7_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -344,35 +344,35 @@ define i32 @length5(i8* %X, i8* %Y) nounwind { define i1 @length5_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length5_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne .LBB12_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 4(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB12_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 4(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 4(%eax), %dl ; X86-NEXT: je .LBB12_3 -; X86-NEXT: .LBB12_1: # %res_block -; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB12_2: # %res_block +; X86-NEXT: movl $1, %ecx ; X86-NEXT: .LBB12_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: cmpl (%rsi), %eax -; X64-NEXT: jne .LBB12_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB12_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 4(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 4(%rsi), %cl ; X64-NEXT: je .LBB12_3 -; X64-NEXT: .LBB12_1: # %res_block +; X64-NEXT: .LBB12_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB12_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -385,7 +385,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind { define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -394,23 +394,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB13_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB13_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: movl 4(%esi), %ecx ; X86-NEXT: movl 4(%eax), %edx ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB13_1 -; X86-NEXT: # BB#3: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: je .LBB13_3 +; X86-NEXT: .LBB13_2: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al ; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB13_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -431,21 +429,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { define i1 @length8_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne .LBB14_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB14_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 4(%eax), %edx ; X86-NEXT: je .LBB14_3 -; X86-NEXT: .LBB14_1: # %res_block -; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB14_2: # %res_block +; X86-NEXT: movl $1, %ecx ; X86-NEXT: .LBB14_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -462,15 +460,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind { define i1 @length8_eq_const(i8* %X) nounwind { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: # %loadbb +; 
X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 -; X86-NEXT: jne .LBB15_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB15_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 ; X86-NEXT: je .LBB15_3 -; X86-NEXT: .LBB15_1: # %res_block +; X86-NEXT: .LBB15_2: # %res_block ; X86-NEXT: movl $1, %eax ; X86-NEXT: .LBB15_3: # %endblock ; X86-NEXT: testl %eax, %eax @@ -502,16 +500,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB16_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB16_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl 8(%rsi), %ecx ; X64-NEXT: je .LBB16_3 -; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: .LBB16_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -534,28 +532,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB17_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB17_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB17_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB17_1: # %res_block +; X64-NEXT: je .LBB17_3 +; X64-NEXT: .LBB17_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB17_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -575,28 +572,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB18_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB18_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB18_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB18_1: # %res_block +; X64-NEXT: je .LBB18_3 +; X64-NEXT: .LBB18_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB18_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m @@ -754,19 +750,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB22_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB22_2 +; 
X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movq 16(%rdi), %rcx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx ; X64-SSE2-NEXT: je .LBB22_3 -; X64-SSE2-NEXT: .LBB22_1: # %res_block +; X64-SSE2-NEXT: .LBB22_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB22_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -774,18 +770,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: length24_eq: -; X64-AVX: # BB#0: # %loadbb +; X64-AVX: # BB#0: ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX-NEXT: jne .LBB22_1 -; X64-AVX-NEXT: # BB#2: # %loadbb1 +; X64-AVX-NEXT: jne .LBB22_2 +; X64-AVX-NEXT: # BB#1: # %loadbb1 ; X64-AVX-NEXT: movq 16(%rdi), %rcx ; X64-AVX-NEXT: xorl %eax, %eax ; X64-AVX-NEXT: cmpq 16(%rsi), %rcx ; X64-AVX-NEXT: je .LBB22_3 -; X64-AVX-NEXT: .LBB22_1: # %res_block +; X64-AVX-NEXT: .LBB22_2: # %res_block ; X64-AVX-NEXT: movl $1, %eax ; X64-AVX-NEXT: .LBB22_3: # %endblock ; X64-AVX-NEXT: testl %eax, %eax @@ -810,18 +806,18 @@ define i1 @length24_eq_const(i8* %X) nounwind { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB23_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB23_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) ; X64-SSE2-NEXT: je .LBB23_3 -; X64-SSE2-NEXT: .LBB23_1: # %res_block +; X64-SSE2-NEXT: .LBB23_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB23_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -829,18 +825,18 @@ define i1 @length24_eq_const(i8* %X) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: length24_eq_const: -; X64-AVX: # BB#0: # %loadbb +; X64-AVX: # BB#0: ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX-NEXT: jne .LBB23_1 -; X64-AVX-NEXT: # BB#2: # %loadbb1 +; X64-AVX-NEXT: jne .LBB23_2 +; X64-AVX-NEXT: # BB#1: # %loadbb1 ; X64-AVX-NEXT: xorl %eax, %eax ; X64-AVX-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-AVX-NEXT: cmpq %rcx, 16(%rdi) ; X64-AVX-NEXT: je .LBB23_3 -; X64-AVX-NEXT: .LBB23_1: # %res_block +; X64-AVX-NEXT: .LBB23_2: # %res_block ; X64-AVX-NEXT: movl $1, %eax ; X64-AVX-NEXT: .LBB23_3: # %endblock ; X64-AVX-NEXT: testl %eax, %eax @@ -898,7 +894,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE1-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 @@ -906,8 +902,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X86-SSE2-NEXT: pmovmskb %xmm1, %edx ; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB25_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB25_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 ; X86-SSE2-NEXT: movdqu 16(%eax), 
%xmm1 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -915,7 +911,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB25_3 -; X86-SSE2-NEXT: .LBB25_1: # %res_block +; X86-SSE2-NEXT: .LBB25_2: # %res_block ; X86-SSE2-NEXT: movl $1, %eax ; X86-SSE2-NEXT: .LBB25_3: # %endblock ; X86-SSE2-NEXT: testl %eax, %eax @@ -923,14 +919,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB25_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB25_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -938,7 +934,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB25_3 -; X64-SSE2-NEXT: .LBB25_1: # %res_block +; X64-SSE2-NEXT: .LBB25_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB25_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -946,20 +942,20 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: length32_eq: -; X64-AVX1: # BB#0: # %loadbb +; X64-AVX1: # BB#0: ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX1-NEXT: jne .LBB25_1 -; X64-AVX1-NEXT: # BB#2: # %loadbb1 +; X64-AVX1-NEXT: jne .LBB25_2 +; X64-AVX1-NEXT: # BB#1: # %loadbb1 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx ; X64-AVX1-NEXT: xorl %eax, %eax ; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-AVX1-NEXT: je .LBB25_3 -; X64-AVX1-NEXT: .LBB25_1: # %res_block +; X64-AVX1-NEXT: .LBB25_2: # %res_block ; X64-AVX1-NEXT: movl $1, %eax ; X64-AVX1-NEXT: .LBB25_3: # %endblock ; X64-AVX1-NEXT: testl %eax, %eax @@ -1006,21 +1002,21 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-SSE1-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movdqu (%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB26_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB26_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB26_3 -; X86-SSE2-NEXT: .LBB26_1: # %res_block +; X86-SSE2-NEXT: .LBB26_2: # %res_block ; X86-SSE2-NEXT: movl $1, %eax ; X86-SSE2-NEXT: .LBB26_3: # %endblock ; X86-SSE2-NEXT: testl %eax, %eax @@ -1028,20 +1024,20 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), 
%xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB26_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB26_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB26_3 -; X64-SSE2-NEXT: .LBB26_1: # %res_block +; X64-SSE2-NEXT: .LBB26_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB26_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -1049,20 +1045,20 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: length32_eq_const: -; X64-AVX1: # BB#0: # %loadbb +; X64-AVX1: # BB#0: ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX1-NEXT: jne .LBB26_1 -; X64-AVX1-NEXT: # BB#2: # %loadbb1 +; X64-AVX1-NEXT: jne .LBB26_2 +; X64-AVX1-NEXT: # BB#1: # %loadbb1 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx ; X64-AVX1-NEXT: xorl %eax, %eax ; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-AVX1-NEXT: je .LBB26_3 -; X64-AVX1-NEXT: .LBB26_1: # %res_block +; X64-AVX1-NEXT: .LBB26_2: # %res_block ; X64-AVX1-NEXT: movl $1, %eax ; X64-AVX1-NEXT: .LBB26_3: # %endblock ; X64-AVX1-NEXT: testl %eax, %eax @@ -1136,20 +1132,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind { ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB28_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB28_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB28_3 -; X64-AVX2-NEXT: .LBB28_1: # %res_block +; X64-AVX2-NEXT: .LBB28_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB28_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -1197,20 +1193,20 @@ define i1 @length64_eq_const(i8* %X) nounwind { ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq_const: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB29_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB29_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB29_3 -; X64-AVX2-NEXT: .LBB29_1: # %res_block +; X64-AVX2-NEXT: .LBB29_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB29_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/CodeGenPrepare/X86/memcmp.ll deleted file mode 100644 index a4f635c956d..00000000000 --- 
a/test/Transforms/CodeGenPrepare/X86/memcmp.ll +++ /dev/null @@ -1,771 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -codegenprepare -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32 -; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64 - -declare i32 @memcmp(i8* nocapture, i8* nocapture, i64) - -define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp2( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]]) -; ALL-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 -; ALL-NEXT: [[TMP8:%.*]] = zext i16 [[TMP6]] to i32 -; ALL-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]] -; ALL-NEXT: ret i32 [[TMP9]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) - ret i32 %call -} - -define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp3( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) -; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = icmp eq i16 [[TMP4]], [[TMP5]] -; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; ALL: res_block: -; ALL-NEXT: [[TMP7:%.*]] = icmp ult i16 [[TMP4]], [[TMP5]] -; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 2 -; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 2 -; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] -; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] -; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 -; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 -; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] -; ALL-NEXT: br label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] -; ALL-NEXT: ret i32 [[PHI_RES]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) - ret i32 %call -} - -define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp4( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]]) -; ALL-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]] -; ALL-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; ALL-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; ALL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] -; ALL-NEXT: ret i32 [[TMP11]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, 
i64 4) - ret i32 %call -} - -define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp5( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] -; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; ALL: res_block: -; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]] -; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 4 -; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] -; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] -; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 -; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 -; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] -; ALL-NEXT: br label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] -; ALL-NEXT: ret i32 [[PHI_RES]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) - ret i32 %call -} - -define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp6( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] -; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; ALL: res_block: -; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] -; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] -; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] -; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* -; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* -; ALL-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2 -; ALL-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 -; ALL-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] -; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] -; ALL-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) -; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) -; ALL-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32 -; ALL-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 -; ALL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]] -; ALL-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] -; ALL-NEXT: ret i32 [[PHI_RES]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) - ret i32 %call -} - -define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp7( -; ALL-NEXT: 
[[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) -; ALL-NEXT: ret i32 [[CALL]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) - ret i32 %call -} - -define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp8( -; X32-NEXT: loadbb: -; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] -; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; X32: res_block: -; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] -; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] -; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] -; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; X32-NEXT: br label [[ENDBLOCK:%.*]] -; X32: loadbb1: -; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* -; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* -; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 -; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 -; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] -; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] -; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) -; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) -; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] -; X32-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] -; X32: endblock: -; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] -; X32-NEXT: ret i32 [[PHI_RES]] -; -; X64-LABEL: @cmp8( -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]]) -; X64-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]] -; X64-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]] -; X64-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; X64-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; X64-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] -; X64-NEXT: ret i32 [[TMP11]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) - ret i32 %call -} - -define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp9( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; X32-NEXT: ret i32 [[CALL]] -; -; X64-LABEL: @cmp9( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; X64: res_block: -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]] -; X64-NEXT: 
[[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] -; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] -; X64-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 -; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 -; X64-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] -; X64-NEXT: br label [[ENDBLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] -; X64-NEXT: ret i32 [[PHI_RES]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) - ret i32 %call -} - -define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp10( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; X32-NEXT: ret i32 [[CALL]] -; -; X64-LABEL: @cmp10( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* -; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4 -; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 -; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] -; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) -; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) -; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64 -; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 -; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] -; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] -; X64-NEXT: ret i32 [[PHI_RES]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) - ret i32 %call -} - -define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; ALL-NEXT: ret i32 [[CALL]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) - ret i32 %call -} - -define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp12( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; X32-NEXT: ret i32 [[CALL]] -; -; X64-LABEL: @cmp12( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: 
[[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* -; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 -; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 -; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] -; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) -; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) -; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 -; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 -; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] -; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] -; X64-NEXT: ret i32 [[PHI_RES]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) - ret i32 %call -} - -define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: ret i32 [[CALL]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) - ret i32 %call -} - -define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: ret i32 [[CALL]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) - ret i32 %call -} - -define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: ret i32 [[CALL]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) - ret i32 %call -} - -define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp16( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; X32-NEXT: ret i32 [[CALL]] -; -; X64-LABEL: @cmp16( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], 
[[LOADBB1]] ] -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i64* -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i64* -; X64-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 1 -; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1 -; X64-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP11]] -; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] -; X64-NEXT: [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]]) -; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) -; X64-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP15]], [[TMP16]] -; X64-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] -; X64-NEXT: ret i32 [[PHI_RES]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) - ret i32 %call -} - -define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq2( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] -; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq3( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = icmp ne i16 [[TMP2]], [[TMP3]] -; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; ALL: res_block: -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 2 -; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 2 -; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] -; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] -; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq4( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] -; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; 
ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq5( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] -; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; ALL: res_block: -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 4 -; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] -; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] -; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq6( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] -; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; ALL: res_block: -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* -; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* -; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2 -; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 -; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] -; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] -; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] -; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq7( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq8( -; X32-NEXT: loadbb: -; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: 
[[TMP2:%.*]] = load i32, i32* [[TMP0]] -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] -; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X32: res_block: -; X32-NEXT: br label [[ENDBLOCK:%.*]] -; X32: loadbb1: -; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* -; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* -; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 -; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 -; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] -; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] -; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] -; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X32: endblock: -; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64-LABEL: @cmp_eq8( -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] -; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq9( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64-LABEL: @cmp_eq9( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] -; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64: res_block: -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] -; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; X64-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] -; X64-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq10( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64-LABEL: @cmp_eq10( -; X64-NEXT: loadbb: -; X64-NEXT: 
[[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] -; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64: res_block: -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* -; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* -; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4 -; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 -; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] -; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] -; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] -; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq12( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64-LABEL: @cmp_eq12( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] -; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64: res_block: -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* -; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* -; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 -; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 -; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] -; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] -; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] -; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* 
[[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - -define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq16( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; -; X64-LABEL: @cmp_eq16( -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i128* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i128* -; X64-NEXT: [[TMP3:%.*]] = load i128, i128* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = load i128, i128* [[TMP2]] -; X64-NEXT: [[TMP5:%.*]] = icmp ne i128 [[TMP3]], [[TMP4]] -; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 -; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X64-NEXT: ret i32 [[CONV]] -; - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) - %cmp = icmp eq i32 %call, 0 - %conv = zext i1 %cmp to i32 - ret i32 %conv -} - diff --git a/test/Transforms/ExpandMemCmp/X86/lit.local.cfg b/test/Transforms/ExpandMemCmp/X86/lit.local.cfg new file mode 100644 index 00000000000..e71f3cc4c41 --- /dev/null +++ b/test/Transforms/ExpandMemCmp/X86/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'X86' in config.root.targets: + config.unsupported = True + diff --git a/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/test/Transforms/ExpandMemCmp/X86/memcmp.ll new file mode 100644 index 00000000000..1abfb20f369 --- /dev/null +++ b/test/Transforms/ExpandMemCmp/X86/memcmp.ll @@ -0,0 +1,792 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -expandmemcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32 +; RUN: opt -S -expandmemcmp -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64 + +declare i32 @memcmp(i8* nocapture, i8* nocapture, i64) + +define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp2( +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] +; ALL-NEXT: 
[[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; ALL-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]]) +; ALL-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 +; ALL-NEXT: [[TMP8:%.*]] = zext i16 [[TMP6]] to i32 +; ALL-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]] +; ALL-NEXT: ret i32 [[TMP9]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) + ret i32 %call +} + +define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp3( +; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL: res_block: +; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i16 [ [[TMP7:%.*]], [[LOADBB]] ] +; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i16 [ [[TMP8:%.*]], [[LOADBB]] ] +; ALL-NEXT: [[TMP1:%.*]] = icmp ult i16 [[PHI_SRC1]], [[PHI_SRC2]] +; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]] +; ALL-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP4]] +; ALL-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]]) +; ALL-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]]) +; ALL-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] +; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; ALL-NEXT: br label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; ALL-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) + ret i32 %call +} + +define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp4( +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; ALL-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]]) +; ALL-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]] +; ALL-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 +; ALL-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 +; ALL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; ALL-NEXT: ret i32 [[TMP11]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) + ret i32 %call +} + +define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp5( +; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL: res_block: +; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ] +; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ] +; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] +; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] 
+; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] +; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; ALL-NEXT: br label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; ALL-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) + ret i32 %call +} + +define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp6( +; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL: res_block: +; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] +; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] +; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] +; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 +; ALL-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2 +; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; ALL-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] +; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; ALL-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; ALL-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 +; ALL-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i32 +; ALL-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]] +; ALL-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; ALL-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) + ret i32 %call +} + +define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp7( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) +; ALL-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) + ret i32 %call +} + +define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp8( +; X32-NEXT: br label [[LOADBB:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], 
[[LOADBB1]] ] +; X32-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb: +; X32-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] +; X32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] +; X32-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; X32-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; X32-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1 +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]] +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) +; X32-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp8( +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]]) +; X64-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]] +; X64-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]] +; X64-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 +; X64-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 +; X64-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; X64-NEXT: ret i32 [[TMP11]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) + ret i32 %call +} + +define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp9( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) +; X32-NEXT: ret i32 [[CALL]] +; +; X64-LABEL: @cmp9( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] +; X64-NEXT: [[TMP13:%.*]] = load i8, i8* 
[[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X64-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) + ret i32 %call +} + +define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp10( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) +; X32-NEXT: ret i32 [[CALL]] +; +; X64-LABEL: @cmp10( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 4 +; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 +; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64 +; X64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]] +; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) + ret i32 %call +} + +define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp11( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) +; ALL-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) + ret i32 %call +} + +define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp12( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) +; X32-NEXT: ret i32 [[CALL]] +; +; X64-LABEL: @cmp12( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; 
X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 2 +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19]] = zext i32 [[TMP17]] to i64 +; X64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]] +; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) + ret i32 %call +} + +define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp13( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) +; ALL-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) + ret i32 %call +} + +define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp14( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) +; ALL-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) + ret i32 %call +} + +define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp15( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) +; ALL-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) + ret i32 %call +} + +define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp16( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) +; X32-NEXT: ret i32 [[CALL]] +; +; X64-LABEL: @cmp16( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to 
i64* +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i64* +; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i64, i64* [[TMP11]], i64 1 +; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]] +; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) +; X64-NEXT: [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]]) +; X64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]] +; X64-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) + ret i32 %call +} + +define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq2( +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] +; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq3( +; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] +; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] +; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq4( +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq5(i8* nocapture readonly %x, i8* 
nocapture readonly %y) { +; ALL-LABEL: @cmp_eq5( +; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] +; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq6( +; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; ALL-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2 +; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; ALL-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] +; ALL-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] +; ALL-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq7( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq8( +; X32-NEXT: br label [[LOADBB:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb: +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: 
[[TMP4:%.*]] = load i32, i32* [[TMP2]] +; X32-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; X32-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 1 +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] +; X32-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] +; X32-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq8( +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq9( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq9( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; X64-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] +; X64-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq10( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq10( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; 
X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 +; X64-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4 +; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] +; X64-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] +; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq11( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; ALL-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq12( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq12( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2 +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] +; X64-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] +; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; ALL-LABEL: @cmp_eq13( +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* 
[[Y:%.*]], i64 13)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp_eq14(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; ALL-LABEL: @cmp_eq15(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+; X32-LABEL: @cmp_eq16(
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16)
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; X32-NEXT:    ret i32 [[CONV]]
+;
+; X64-LABEL: @cmp_eq16(
+; X64-NEXT:    [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i128*
+; X64-NEXT:    [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i128*
+; X64-NEXT:    [[TMP3:%.*]] = load i128, i128* [[TMP1]]
+; X64-NEXT:    [[TMP4:%.*]] = load i128, i128* [[TMP2]]
+; X64-NEXT:    [[TMP5:%.*]] = icmp ne i128 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; X64-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index e2fdfe82b8c..0371cd0372f 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -391,6 +391,7 @@ int main(int argc, char **argv) {
   initializeTarget(Registry);
   // For codegen passes, only passes that do IR to IR transformation are
   // supported.
+  initializeExpandMemCmpPassPass(Registry);
   initializeScalarizeMaskedMemIntrinPass(Registry);
   initializeCodeGenPreparePass(Registry);
   initializeAtomicExpandPass(Registry);
-- 
cgit v1.2.3


From 9e5188ca177ad10813f233bd693a57d73a90b86b Mon Sep 17 00:00:00 2001
From: Mikael Holmen
Date: Fri, 3 Nov 2017 14:15:08 +0000
Subject: [ADCE] Use MapVector for BlockInfo to make iteration order
 deterministic

Summary:
Also added a reserve() method to MapVector since we want to use that
from ADCE.

DenseMap does not provide a deterministic iteration order, so with it we
would visit the members of BlockInfo in random order, eventually leading
to a random order of the blocks in the predecessor lists. Without this
change, I get the same predecessor order about 90% of the time when I
compile a certain reproducer, and a different one the remaining 10%.

No idea how to make a proper test case for this.
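As an editorial illustration (a hypothetical example, not part of the
patch; it assumes LLVM's ADT and Support headers plus the reserve()
introduced here), the ordering difference looks like this:

    #include "llvm/ADT/MapVector.h"
    #include "llvm/Support/raw_ostream.h"

    int main() {
      llvm::MapVector<int, const char *> MV;
      MV.reserve(3); // sizes the underlying Map and Vector together
      MV.insert({42, "a"});
      MV.insert({7, "b"});
      MV.insert({13, "c"});
      // Always visits 42, 7, 13 in insertion order, run after run; a
      // DenseMap with the same keys may visit them in any order.
      for (auto &KV : MV)
        llvm::outs() << KV.first << " -> " << KV.second << "\n";
      return 0;
    }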
Reviewers: kuhar, david2050

Reviewed By: kuhar

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D39593

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317323 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/MapVector.h   | 7 +++++++
 lib/Transforms/Scalar/ADCE.cpp | 4 +++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/include/llvm/ADT/MapVector.h b/include/llvm/ADT/MapVector.h
index 26a555ee1d3..3d78f4b203c 100644
--- a/include/llvm/ADT/MapVector.h
+++ b/include/llvm/ADT/MapVector.h
@@ -56,6 +56,13 @@ public:
 
   size_type size() const { return Vector.size(); }
 
+  /// Grow the MapVector so that it can contain at least \p NumEntries items
+  /// before resizing again.
+  void reserve(size_type NumEntries) {
+    Map.reserve(NumEntries);
+    Vector.reserve(NumEntries);
+  }
+
   iterator begin() { return Vector.begin(); }
   const_iterator begin() const { return Vector.begin(); }
   iterator end() { return Vector.end(); }
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index f04d0f05ffc..1e683db5020 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -118,7 +119,8 @@ class AggressiveDeadCodeElimination {
   PostDominatorTree &PDT;
 
   /// Mapping of blocks to associated information, an element in BlockInfoVec.
-  DenseMap<BasicBlock *, BlockInfoType> BlockInfo;
+  /// Use MapVector to get deterministic iteration order.
+  MapVector<BasicBlock *, BlockInfoType> BlockInfo;
   bool isLive(BasicBlock *BB) { return BlockInfo[BB].Live; }
 
   /// Mapping of instructions to associated information.
-- 
cgit v1.2.3


From c9ed638d21437f805b89f805252ec78a59b22f96 Mon Sep 17 00:00:00 2001
From: Anna Thomas
Date: Fri, 3 Nov 2017 14:25:39 +0000
Subject: [LoopPredication] NFC: Refactored code to separate out functions
 being reused

Summary:
Refactored the code to separate out common functions that are being
reused. This is to reduce the diff for upcoming changes regarding loop
predication with reverse loops.

This refactoring is what we have in our downstream code.
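As an illustrative worked instance of the condition produced by the
extracted widenICmpRangeCheckIncrementingLoop (editorial, derived from
the code in the diff below): for a guard "i u< guardLimit" and a latch
"i u< latchLimit" whose IVs both start at 0 with unit step, RHS becomes
(guardLimit - 0) + (0 - 1) = guardLimit - 1, the ULT latch predicate is
flipped to a ULE limit check, and the check hoisted into the preheader
reduces to

    0 u< guardLimit && latchLimit u<= guardLimit - 1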
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317324 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LoopPredication.cpp | 154 ++++++++++++++++++------------
 1 file changed, 92 insertions(+), 62 deletions(-)

diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp
index e680fbed113..52dea3254e7 100644
--- a/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/lib/Transforms/Scalar/LoopPredication.cpp
@@ -189,6 +189,10 @@ class LoopPredication {
                 const SCEV *Limit)
         : Pred(Pred), IV(IV), Limit(Limit) {}
     LoopICmp() {}
+    void dump() {
+      dbgs() << "LoopICmp Pred = " << Pred << ", IV = " << *IV
+             << ", Limit = " << *Limit << "\n";
+    }
   };
 
   ScalarEvolution *SE;
@@ -198,6 +202,7 @@ class LoopPredication {
   BasicBlock *Preheader;
   LoopICmp LatchCheck;
 
+  bool isSupportedStep(const SCEV* Step);
   Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI) {
     return parseLoopICmp(ICI->getPredicate(), ICI->getOperand(0),
                          ICI->getOperand(1));
@@ -207,12 +212,18 @@ class LoopPredication {
 
   Optional<LoopICmp> parseLoopLatchICmp();
 
+  bool CanExpand(const SCEV* S);
   Value *expandCheck(SCEVExpander &Expander, IRBuilder<> &Builder,
                      ICmpInst::Predicate Pred, const SCEV *LHS,
                      const SCEV *RHS, Instruction *InsertAt);
 
   Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander,
                                         IRBuilder<> &Builder);
+  Optional<Value *> widenICmpRangeCheckIncrementingLoop(LoopICmp LatchCheck,
+                                                        LoopICmp RangeCheck,
+                                                        SCEVExpander &Expander,
+                                                        IRBuilder<> &Builder);
+
   bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
 
   // When the IV type is wider than the range operand type, we can still do loop
@@ -348,6 +359,67 @@ LoopPredication::generateLoopLatchCheck(Type *RangeCheckType) {
   return NewLatchCheck;
 }
 
+bool LoopPredication::isSupportedStep(const SCEV* Step) {
+  return Step->isOne();
+}
+
+bool LoopPredication::CanExpand(const SCEV* S) {
+  return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
+}
+
+Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
+    LoopPredication::LoopICmp LatchCheck, LoopPredication::LoopICmp RangeCheck,
+    SCEVExpander &Expander, IRBuilder<> &Builder) {
+  auto *Ty = RangeCheck.IV->getType();
+  // Generate the widened condition for the forward loop:
+  //   guardStart u< guardLimit &&
+  //   latchLimit <pred> guardLimit - 1 - guardStart + latchStart
+  // where <pred> depends on the latch condition predicate. See the file
+  // header comment for the reasoning.
+  // guardLimit - guardStart + latchStart - 1
+  const SCEV *GuardStart = RangeCheck.IV->getStart();
+  const SCEV *GuardLimit = RangeCheck.Limit;
+  const SCEV *LatchStart = LatchCheck.IV->getStart();
+  const SCEV *LatchLimit = LatchCheck.Limit;
+
+  // guardLimit - guardStart + latchStart - 1
+  const SCEV *RHS =
+      SE->getAddExpr(SE->getMinusSCEV(GuardLimit, GuardStart),
+                     SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));
+  if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
+      !CanExpand(LatchLimit) || !CanExpand(RHS)) {
+    DEBUG(dbgs() << "Can't expand limit check!\n");
+    return None;
+  }
+  ICmpInst::Predicate LimitCheckPred;
+  switch (LatchCheck.Pred) {
+  case ICmpInst::ICMP_ULT:
+    LimitCheckPred = ICmpInst::ICMP_ULE;
+    break;
+  case ICmpInst::ICMP_ULE:
+    LimitCheckPred = ICmpInst::ICMP_ULT;
+    break;
+  case ICmpInst::ICMP_SLT:
+    LimitCheckPred = ICmpInst::ICMP_SLE;
+    break;
+  case ICmpInst::ICMP_SLE:
+    LimitCheckPred = ICmpInst::ICMP_SLT;
+    break;
+  default:
+    llvm_unreachable("Unsupported loop latch!");
+  }
+
+  DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
+  DEBUG(dbgs() << "RHS: " << *RHS << "\n");
+  DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
+
+  Instruction *InsertAt = Preheader->getTerminator();
+  auto *LimitCheck =
+      expandCheck(Expander, Builder, LimitCheckPred, LatchLimit, RHS, InsertAt);
+  auto *FirstIterationCheck = expandCheck(Expander, Builder, RangeCheck.Pred,
+                                          GuardStart, GuardLimit, InsertAt);
+  return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
+}
 
 /// If ICI can be widened to a loop invariant condition emits the loop
 /// invariant condition in the loop preheader and return it, otherwise
 /// returns None.
@@ -366,6 +438,8 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
     DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
     return None;
   }
+  DEBUG(dbgs() << "Guard check:\n");
+  DEBUG(RangeCheck->dump());
 
   if (RangeCheck->Pred != ICmpInst::ICMP_ULT) {
     DEBUG(dbgs() << "Unsupported range check predicate(" << RangeCheck->Pred
                  << ")!\n");
@@ -379,7 +453,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
   auto *Step = RangeCheckIV->getStepRecurrence(*SE);
   // We cannot just compare with latch IV step because the latch and range IVs
   // may have different types.
-  if (!Step->isOne()) {
+  if (!isSupportedStep(Step)) {
     DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
     return None;
   }
@@ -397,58 +471,9 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
   // value and type.
   assert(Step == CurrLatchCheck.IV->getStepRecurrence(*SE) &&
          "Range and latch should have same step recurrence!");
-  // Generate the widened condition:
-  //   guardStart u< guardLimit &&
-  //   latchLimit <pred> guardLimit - 1 - guardStart + latchStart
-  // where <pred> depends on the latch condition predicate. See the file
-  // header comment for the reasoning.
-  const SCEV *GuardStart = RangeCheckIV->getStart();
-  const SCEV *GuardLimit = RangeCheck->Limit;
-  const SCEV *LatchStart = CurrLatchCheck.IV->getStart();
-  const SCEV *LatchLimit = CurrLatchCheck.Limit;
-
-  // guardLimit - guardStart + latchStart - 1
-  const SCEV *RHS =
-      SE->getAddExpr(SE->getMinusSCEV(GuardLimit, GuardStart),
-                     SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));
-
-  ICmpInst::Predicate LimitCheckPred;
-  switch (CurrLatchCheck.Pred) {
-  case ICmpInst::ICMP_ULT:
-    LimitCheckPred = ICmpInst::ICMP_ULE;
-    break;
-  case ICmpInst::ICMP_ULE:
-    LimitCheckPred = ICmpInst::ICMP_ULT;
-    break;
-  case ICmpInst::ICMP_SLT:
-    LimitCheckPred = ICmpInst::ICMP_SLE;
-    break;
-  case ICmpInst::ICMP_SLE:
-    LimitCheckPred = ICmpInst::ICMP_SLT;
-    break;
-  default:
-    llvm_unreachable("Unsupported loop latch!");
-  }
-
-  DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
-  DEBUG(dbgs() << "RHS: " << *RHS << "\n");
-  DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
-
-  auto CanExpand = [this](const SCEV *S) {
-    return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
-  };
-  if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
-      !CanExpand(LatchLimit) || !CanExpand(RHS)) {
-    DEBUG(dbgs() << "Can't expand limit check!\n");
-    return None;
-  }
-
-  Instruction *InsertAt = Preheader->getTerminator();
-  auto *LimitCheck =
-      expandCheck(Expander, Builder, LimitCheckPred, LatchLimit, RHS, InsertAt);
-  auto *FirstIterationCheck = expandCheck(Expander, Builder, RangeCheck->Pred,
-                                          GuardStart, GuardLimit, InsertAt);
-  return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
+  return widenICmpRangeCheckIncrementingLoop(CurrLatchCheck, *RangeCheck,
+                                             Expander, Builder);
 }
 
 bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
@@ -541,15 +566,6 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
     return None;
   }
 
-  if (Result->Pred != ICmpInst::ICMP_ULT &&
-      Result->Pred != ICmpInst::ICMP_SLT &&
-      Result->Pred != ICmpInst::ICMP_ULE &&
-      Result->Pred != ICmpInst::ICMP_SLE) {
-    DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
-                 << ")!\n");
-    return None;
-  }
-
   // Check affine first, so if it's not we don't try to compute the step
   // recurrence.
   if (!Result->IV->isAffine()) {
@@ -558,11 +574,22 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
   }
 
   auto *Step = Result->IV->getStepRecurrence(*SE);
-  if (!Step->isOne()) {
+  if (!isSupportedStep(Step)) {
    DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
     return None;
   }
 
+  auto IsUnsupportedPredicate = [](const SCEV *Step, ICmpInst::Predicate Pred) {
+    assert(Step->isOne() && "expected Step to be one!");
+    return Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_SLT &&
+           Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_SLE;
+  };
+
+  if (IsUnsupportedPredicate(Step, Result->Pred)) {
+    DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
+                 << ")!\n");
+    return None;
+  }
+
   return Result;
 }
 
@@ -621,6 +648,9 @@ bool LoopPredication::runOnLoop(Loop *Loop) {
     return false;
   LatchCheck = *LatchCheckOpt;
 
+  DEBUG(dbgs() << "Latch check:\n");
+  DEBUG(LatchCheck.dump());
+
   // Collect all the guards into a vector and process later, so as not
   // to invalidate the instruction iterator.
   SmallVector<IntrinsicInst *, 4> Guards;
-- 
cgit v1.2.3


From d1f487bc595728ae9c4dc3aa461a41470e19cf12 Mon Sep 17 00:00:00 2001
From: "Andrew V. Tischenko"
Date: Fri, 3 Nov 2017 15:25:13 +0000
Subject: Fix for Bug 34475 - LOCK/REP/REPNE prefixes emitted as instructions
 on their own.
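Previously both the ATT and the Intel printer terminated these prefixes
with a newline, so a prefixed instruction came out as two lines, e.g.

    rep
    movsb   (%rsi), %es:(%rdi)

With the separator changed to a tab, the prefix and the instruction now
share a single line:

    rep     movsb   (%rsi), %es:(%rdi)

(Editorial illustration, reconstructed from the test updates below.)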
Differential Revision: https://reviews.llvm.org/D39546 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317330 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp | 6 +- lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp | 6 +- test/CodeGen/X86/inline-asm-A-constraint.ll | 3 +- .../AddressSanitizer/X86/asm_rep_movs.ll | 6 +- test/MC/Disassembler/X86/prefixes-i386.txt | 78 ++++++++-------------- test/MC/Disassembler/X86/prefixes-x86_64.txt | 24 +++---- test/MC/Disassembler/X86/prefixes.txt | 66 ++++++------------ test/MC/Disassembler/X86/simple-tests.txt | 9 +-- 8 files changed, 68 insertions(+), 130 deletions(-) diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index 6ff1136cd85..0c99dbbe328 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -54,12 +54,12 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, if (TSFlags & X86II::LOCK) OS << "\tlock\t"; if (!(TSFlags & X86II::LOCK) && Flags & X86::IP_HAS_LOCK) - OS << "\tlock\n"; + OS << "\tlock\t"; if (Flags & X86::IP_HAS_REPEAT_NE) - OS << "\trepne\n"; + OS << "\trepne\t"; else if (Flags & X86::IP_HAS_REPEAT) - OS << "\trep\n"; + OS << "\trep\t"; // Output CALLpcrel32 as "callq" in 64-bit mode. // In Intel annotation it's always emitted as "call". diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 464941a1bab..1f02600a798 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -41,13 +41,13 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, uint64_t TSFlags = Desc.TSFlags; if (TSFlags & X86II::LOCK) - OS << "\tlock\n"; + OS << "\tlock\t"; unsigned Flags = MI->getFlags(); if (Flags & X86::IP_HAS_REPEAT_NE) - OS << "\trepne\n"; + OS << "\trepne\t"; else if (Flags & X86::IP_HAS_REPEAT) - OS << "\trep\n"; + OS << "\trep\t"; printInstruction(MI, OS); diff --git a/test/CodeGen/X86/inline-asm-A-constraint.ll b/test/CodeGen/X86/inline-asm-A-constraint.ll index 2ad011e88e0..7975b318eff 100644 --- a/test/CodeGen/X86/inline-asm-A-constraint.ll +++ b/test/CodeGen/X86/inline-asm-A-constraint.ll @@ -19,8 +19,7 @@ entry: %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1 ret { i64, i64 } %.fca.1.insert } -; CHECK: lock -; CHECK-NEXT: cmpxchg16b +; CHECK: lock cmpxchg16b attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll b/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll index c3c2435fc87..1fc20febc94 100644 --- a/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll +++ b/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll @@ -39,8 +39,7 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: [[B]]: ; CHECK-NEXT: popfq -; CHECK: rep -; CHECK-NEXT: movsb (%rsi), %es:(%rdi) +; CHECK: rep movsb (%rsi), 
%es:(%rdi) ; Function Attrs: nounwind sanitize_address uwtable define void @rep_movs_1b(i8* %dst, i8* %src, i64 %n) #0 { @@ -73,8 +72,7 @@ entry: ; CHECK: [[Q]]: ; CHECK-NEXT: popfq -; CHECK: rep -; CHECK-NEXT: movsq (%rsi), %es:(%rdi) +; CHECK: rep movsq (%rsi), %es:(%rdi) ; Function Attrs: nounwind sanitize_address uwtable define void @rep_movs_8b(i64* %dst, i64* %src, i64 %n) #0 { diff --git a/test/MC/Disassembler/X86/prefixes-i386.txt b/test/MC/Disassembler/X86/prefixes-i386.txt index ff2fb223873..3152cc31aad 100644 --- a/test/MC/Disassembler/X86/prefixes-i386.txt +++ b/test/MC/Disassembler/X86/prefixes-i386.txt @@ -3,85 +3,59 @@ # CHECK: movl %fs:24, %eax 0x64 0xa1 0x18 0x00 0x00 0x00 # mov eax, dword ptr fs:[18h] -# CHECK: rep -# CHECK-NEXT: insb %dx, %es:(%edi) +# CHECK: rep insb %dx, %es:(%edi) 0xf3 0x6c #rep ins -# CHECK: rep -# CHECK-NEXT: insl %dx, %es:(%edi) +# CHECK: rep insl %dx, %es:(%edi) 0xf3 0x6d #rep ins -# CHECK: rep -# CHECK-NEXT: movsb (%esi), %es:(%edi) +# CHECK: rep movsb (%esi), %es:(%edi) 0xf3 0xa4 #rep movs -# CHECK: rep -# CHECK-NEXT: movsl (%esi), %es:(%edi) +# CHECK: rep movsl (%esi), %es:(%edi) 0xf3 0xa5 #rep movs -# CHECK: rep -# CHECK-NEXT: outsb (%esi), %dx +# CHECK: rep outsb (%esi), %dx 0xf3 0x6e #rep outs -# CHECK: rep -# CHECK-NEXT: outsl (%esi), %dx +# CHECK: rep outsl (%esi), %dx 0xf3 0x6f #rep outs -# CHECK: rep -# CHECK-NEXT: lodsb (%esi), %al +# CHECK: rep lodsb (%esi), %al 0xf3 0xac #rep lods -# CHECK: rep -# CHECK-NEXT: lodsl (%esi), %eax +# CHECK: rep lodsl (%esi), %eax 0xf3 0xad #rep lods -# CHECK: rep -# CHECK-NEXT: stosb %al, %es:(%edi) +# CHECK: rep stosb %al, %es:(%edi) 0xf3 0xaa #rep stos -# CHECK: rep -# CHECK-NEXT: stosl %eax, %es:(%edi) +# CHECK: rep stosl %eax, %es:(%edi) 0xf3 0xab #rep stos -# CHECK: rep -# CHECK-NEXT: cmpsb %es:(%edi), (%esi) +# CHECK: rep cmpsb %es:(%edi), (%esi) 0xf3 0xa6 #rep cmps -# CHECK: rep -# CHECK-NEXT: cmpsl %es:(%edi), (%esi) +# CHECK: rep cmpsl %es:(%edi), (%esi) 0xf3 0xa7 #repe cmps -# CHECK: rep -# CHECK-NEXT: scasb %es:(%edi), %al +# CHECK: rep scasb %es:(%edi), %al 0xf3 0xae #repe scas -# CHECK: rep -# CHECK-NEXT: scasl %es:(%edi), %eax +# CHECK: rep scasl %es:(%edi), %eax 0xf3 0xaf #repe scas -# CHECK: repne -# CHECK-NEXT: cmpsb %es:(%edi), (%esi) +# CHECK: repne cmpsb %es:(%edi), (%esi) 0xf2 0xa6 #repne cmps -# CHECK: repne -# CHECK-NEXT: cmpsl %es:(%edi), (%esi) +# CHECK: repne cmpsl %es:(%edi), (%esi) 0xf2 0xa7 #repne cmps -# CHECK: repne -# CHECK-NEXT: scasb %es:(%edi), %al +# CHECK: repne scasb %es:(%edi), %al 0xf2 0xae #repne scas -# CHECK: repne -# CHECK-NEXT: scasl %es:(%edi), %eax +# CHECK: repne scasl %es:(%edi), %eax 0xf2 0xaf #repne scas -# CHECK: repne -# CHECK-NEXT: scasw %es:(%edi), %ax +# CHECK: repne scasw %es:(%edi), %ax 0xf2 0x66 0xaf -# CHECK: repne -# CHECK-NEXT: scasw %es:(%edi), %ax +# CHECK: repne scasw %es:(%edi), %ax 0x66 0xf2 0xaf -# CHECK: rep -# CHECK-NEXT: scasw %es:(%edi), %ax +# CHECK: rep scasw %es:(%edi), %ax 0xf3 0x66 0xaf -# CHECK: rep -# CHECK-NEXT: scasw %es:(%edi), %ax +# CHECK: rep scasw %es:(%edi), %ax 0x66 0xf3 0xaf -# CHECK: repne -# CHECK: insw %dx, %es:(%edi) +# CHECK: repne insw %dx, %es:(%edi) 0xf2 0x66 0x6d -# CHECK: repne -# CHECK: insw %dx, %es:(%edi) +# CHECK: repne insw %dx, %es:(%edi) 0x66 0xf2 0x6d -# CHECK: rep -# CHECK: insw %dx, %es:(%edi) +# CHECK: rep insw %dx, %es:(%edi) 0xf3 0x66 0x6d -# CHECK: rep -# CHECK: insw %dx, %es:(%edi) +# CHECK: rep insw %dx, %es:(%edi) 0x66 0xf3 0x6d diff --git a/test/MC/Disassembler/X86/prefixes-x86_64.txt 
b/test/MC/Disassembler/X86/prefixes-x86_64.txt index 7a9208f7b63..c9bf512aa75 100644 --- a/test/MC/Disassembler/X86/prefixes-x86_64.txt +++ b/test/MC/Disassembler/X86/prefixes-x86_64.txt @@ -9,30 +9,22 @@ # CHECK: mulsd %xmm7, %xmm7 0xf2 0x66 0x0f 0x59 0xff -# CHECK: repne -# CHECK-NEXT: scasw %es:(%rdi), %ax +# CHECK: repne scasw %es:(%rdi), %ax 0xf2 0x66 0xaf -# CHECK: rep -# CHECK-NEXT: scasw %es:(%rdi), %ax +# CHECK: repne scasw %es:(%rdi), %ax 0x66 0xf2 0xaf -# CHECK: rep -# CHECK-NEXT: scasw %es:(%rdi), %ax +# CHECK: rep scasw %es:(%rdi), %ax 0xf3 0x66 0xaf -# CHECK: rep -# CHECK-NEXT: scasw %es:(%rdi), %ax +# CHECK: rep scasw %es:(%rdi), %ax 0x66 0xf3 0xaf -# CHECK: repne -# CHECK: insw %dx, %es:(%rdi) +# CHECK: repne insw %dx, %es:(%rdi) 0xf2 0x66 0x6d -# CHECK: repne -# CHECK: insw %dx, %es:(%rdi) +# CHECK: repne insw %dx, %es:(%rdi) 0x66 0xf2 0x6d -# CHECK: rep -# CHECK: insw %dx, %es:(%rdi) +# CHECK: rep insw %dx, %es:(%rdi) 0xf3 0x66 0x6d -# CHECK: rep -# CHECK: insw %dx, %es:(%rdi) +# CHECK: rep insw %dx, %es:(%rdi) 0x66 0xf3 0x6d diff --git a/test/MC/Disassembler/X86/prefixes.txt b/test/MC/Disassembler/X86/prefixes.txt index 983e09670d6..75e11ae93f4 100644 --- a/test/MC/Disassembler/X86/prefixes.txt +++ b/test/MC/Disassembler/X86/prefixes.txt @@ -1,73 +1,53 @@ # RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s -# CHECK: rep -# CHECK-NEXT: insb %dx, %es:(%rdi) +# CHECK: rep insb %dx, %es:(%rdi) 0xf3 0x6c #rep ins -# CHECK: rep -# CHECK-NEXT: insl %dx, %es:(%rdi) +# CHECK: rep insl %dx, %es:(%rdi) 0xf3 0x6d #rep ins -# CHECK: rep -# CHECK-NEXT: movsb (%rsi), %es:(%rdi) +# CHECK: rep movsb (%rsi), %es:(%rdi) 0xf3 0xa4 #rep movs -# CHECK: rep -# CHECK-NEXT: movsl (%rsi), %es:(%rdi) +# CHECK: rep movsl (%rsi), %es:(%rdi) 0xf3 0xa5 #rep movs -# CHECK: rep -# CHECK-NEXT: outsb (%rsi), %dx +# CHECK: rep outsb (%rsi), %dx 0xf3 0x6e #rep outs -# CHECK: rep -# CHECK-NEXT: outsl (%rsi), %dx +# CHECK: rep outsl (%rsi), %dx 0xf3 0x6f #rep outs -# CHECK: rep -# CHECK-NEXT: lodsb (%rsi), %al +# CHECK: rep lodsb (%rsi), %al 0xf3 0xac #rep lods -# CHECK: rep -# CHECK-NEXT: lodsl (%rsi), %eax +# CHECK: rep lodsl (%rsi), %eax 0xf3 0xad #rep lods -# CHECK: rep -# CHECK-NEXT: stosb %al, %es:(%rdi) +# CHECK: rep stosb %al, %es:(%rdi) 0xf3 0xaa #rep stos -# CHECK: rep -# CHECK-NEXT: stosl %eax, %es:(%rdi) +# CHECK: rep stosl %eax, %es:(%rdi) 0xf3 0xab #rep stos -# CHECK: rep -# CHECK-NEXT: cmpsb %es:(%rdi), (%rsi) +# CHECK: rep cmpsb %es:(%rdi), (%rsi) 0xf3 0xa6 #rep cmps -# CHECK: rep -# CHECK-NEXT: cmpsl %es:(%rdi), (%rsi) +# CHECK: rep cmpsl %es:(%rdi), (%rsi) 0xf3 0xa7 #repe cmps -# CHECK: rep -# CHECK-NEXT: scasb %es:(%rdi), %al +# CHECK: rep scasb %es:(%rdi), %al 0xf3 0xae #repe scas -# CHECK: rep -# CHECK-NEXT: scasl %es:(%rdi), %eax +# CHECK: rep scasl %es:(%rdi), %eax 0xf3 0xaf #repe scas -# CHECK: repne -# CHECK-NEXT: cmpsb %es:(%rdi), (%rsi) +# CHECK: repne cmpsb %es:(%rdi), (%rsi) 0xf2 0xa6 #repne cmps -# CHECK: repne -# CHECK-NEXT: cmpsl %es:(%rdi), (%rsi) +# CHECK: repne cmpsl %es:(%rdi), (%rsi) 0xf2 0xa7 #repne cmps -# CHECK: repne -# CHECK-NEXT: scasb %es:(%rdi), %al +# CHECK: repne scasb %es:(%rdi), %al 0xf2 0xae #repne scas -# CHECK: repne -# CHECK-NEXT: scasl %es:(%rdi), %eax +# CHECK: repne scasl %es:(%rdi), %eax 0xf2 0xaf #repne scas # CHECK: lock -# CHECK-NEXT: orl $16, %fs:776 +# CHECK-NEXT: orl $16, %fs:776 0xf0 0x64 0x83 0x0c 0x25 0x08 0x03 0x00 0x00 0x10 # CHECK: movq %fs:768, %rdi 0x64 0x48 0x8b 0x3c 0x25 0x00 0x03 0x00 0x00 -# CHECK: rep -# CHECK-NEXT: stosq 
%rax, %es:(%rdi)
+# CHECK: rep stosq %rax, %es:(%rdi)
 0xf3 0x48 0xab
-# CHECK: rep
-# CHECK-NEXT: stosq %rax, %es:(%edi)
+# CHECK: rep stosq %rax, %es:(%edi)
 0xf3 0x67 0x48 0xab
 
 # CHECK: movl 32(%rbp), %eax
@@ -104,11 +84,9 @@
 0x66,0x83,0xc0,0xf4
 
 # Test that multiple redundant prefixes work (redundant, but valid x86).
-# CHECK: rep
-# CHECK-NEXT: stosq
+# CHECK: rep stosq
 0xf3 0xf3 0x48 0xab
-
 # Test that we can disassembler control registers above CR8
 # CHECK: movq %cr15, %rax
 0x44 0x0f 0x20 0xf8
diff --git a/test/MC/Disassembler/X86/simple-tests.txt b/test/MC/Disassembler/X86/simple-tests.txt
index 86d9f92fbbf..39074934164 100644
--- a/test/MC/Disassembler/X86/simple-tests.txt
+++ b/test/MC/Disassembler/X86/simple-tests.txt
@@ -851,14 +851,11 @@
 0xf0 0x48 0x0f 0xc1 0xcb
 
 # rdar://13493622 lldb doesn't print the x86 rep/repne prefix when disassembling
-# CHECK: repne
-# CHECK-NEXT: movsl
+# CHECK: repne movsl
 0xf2 0xa5
-# CHECK: repne
-# CHECK-NEXT: movsq
+# CHECK: repne movsq
 0xf2 0x48 0xa5
-# CHECK: repne
-# CHECK-NEXT: movb $0, (%rax)
+# CHECK: repne movb $0, (%rax)
 0xf2 0xc6 0x0 0x0
 
 # rdar://11019859 Support 2013 Haswell RTM instructions and HLE prefixes
--
cgit v1.2.3


From 876a9b9b65e9035aaf1b22739a2b9c8d9698e242 Mon Sep 17 00:00:00 2001
From: Simon Dardis
Date: Fri, 3 Nov 2017 15:35:13 +0000
Subject: [mips] Match 'ins' and its variants with C++ code

Change the ISel matching of 'ins', 'dins[mu]' from tablegen code to
C++ code. This resolves an issue where ISel would select 'dins' instead
of 'dinsm' when the instruction's size and position were individually in
range but their sum was out of range according to the ISA specification.

Reviewers: atanasyan

Differential Revision: https://reviews.llvm.org/D39117

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317331 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MicroMips64r6InstrInfo.td |  7 ++--
 lib/Target/Mips/MicroMipsInstrInfo.td     |  2 +-
 lib/Target/Mips/Mips64InstrInfo.td        |  6 ++--
 lib/Target/Mips/MipsInstrInfo.td          |  9 ++---
 lib/Target/Mips/MipsSEISelDAGToDAG.cpp    | 58 +++++++++++++++++++++++++++++++
 test/CodeGen/Mips/dins.ll                 | 14 +++++---
 6 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/lib/Target/Mips/MicroMips64r6InstrInfo.td b/lib/Target/Mips/MicroMips64r6InstrInfo.td
index e0f4d833392..4f705feed0a 100644
--- a/lib/Target/Mips/MicroMips64r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips64r6InstrInfo.td
@@ -162,12 +162,11 @@ class DCLZ_MM64R6_DESC {
 class DINSU_MM64R6_DESC : InsBase<"dinsu", GPR64Opnd, uimm5_plus32,
                                   uimm5_inssize_plus1, immZExt5Plus32,
-                                  immZExt5Plus1, MipsIns>;
+                                  immZExt5Plus1>;
 class DINSM_MM64R6_DESC : InsBase<"dinsm", GPR64Opnd, uimm5, uimm_range_2_64,
-                                  immZExt5, immZExtRange2To64, MipsIns>;
+                                  immZExt5, immZExtRange2To64>;
 class DINS_MM64R6_DESC : InsBase<"dins", GPR64Opnd, uimm5_report_uimm6,
-                                 uimm5_inssize_plus1, immZExt5, immZExt5Plus1,
-                                 MipsIns>;
+                                 uimm5_inssize_plus1, immZExt5, immZExt5Plus1>;
 class DMTC0_MM64R6_DESC : MTC0_MMR6_DESC_BASE<"dmtc0", COP0Opnd, GPR64Opnd,
                                               II_DMTC0>;
 class DMTC1_MM64R6_DESC : MTC1_MMR6_DESC_BASE<"dmtc1", FGR64Opnd, GPR64Opnd,
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 1f869db4efe..90399ddfab5 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -884,7 +884,7 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
   def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1,
                               immZExt5, immZExt5Plus1, MipsExt>,
                EXT_FM_MM<0x2c>;
   def INS_MM
: MMRel, InsBase<"ins", GPR32Opnd, uimm5, uimm5_inssize_plus1, - immZExt5, immZExt5Plus1, MipsIns>, + immZExt5, immZExt5Plus1>, EXT_FM_MM<0x0c>; /// Jump Instructions diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 04a050c2ff4..dbd47de4dad 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -341,13 +341,13 @@ let AdditionalPredicates = [NotInMicroMips] in { // for dinsm and dinsu like binutils. let DecoderMethod = "DecodeDINS" in { def DINS : InsBase<"dins", GPR64Opnd, uimm6, uimm5_inssize_plus1, - immZExt5, immZExt5Plus1, MipsIns>, EXT_FM<7>, + immZExt5, immZExt5Plus1>, EXT_FM<7>, ISA_MIPS64R2; def DINSU : InsBase<"dinsu", GPR64Opnd, uimm5_plus32, uimm5_inssize_plus1, - immZExt5Plus32, immZExt5Plus1, MipsIns>, + immZExt5Plus32, immZExt5Plus1>, EXT_FM<6>, ISA_MIPS64R2; def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5, uimm_range_2_64, - immZExt5, immZExtRange2To64, MipsIns>, + immZExt5, immZExtRange2To64>, EXT_FM<5>, ISA_MIPS64R2; } } diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index c4c3eb760c5..ac4980e99a7 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -1726,12 +1726,13 @@ class ExtBase, ISA_MIPS32R2; +// 'ins' and its' 64 bit variants are matched by C++ code. class InsBase: + Operand SizeOpnd, PatFrag PosImm, PatFrag SizeImm>: InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size, RO:$src), !strconcat(opstr, " $rt, $rs, $pos, $size"), - [(set RO:$rt, (Op RO:$rs, PosImm:$pos, SizeImm:$size, RO:$src))], + [(set RO:$rt, (null_frag RO:$rs, PosImm:$pos, SizeImm:$size, + RO:$src))], II_INS, FrmR, opstr>, ISA_MIPS32R2 { let Constraints = "$src = $rt"; } @@ -2236,7 +2237,7 @@ let AdditionalPredicates = [NotInMicroMips] in { EXT_FM<0>; def INS : MMRel, StdMMR6Rel, InsBase<"ins", GPR32Opnd, uimm5, uimm5_inssize_plus1, immZExt5, - immZExt5Plus1, MipsIns>, + immZExt5Plus1>, EXT_FM<4>; } /// Move Control Registers From/To CPU Registers diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 283fcaa73a7..3c6a7d7a665 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -905,6 +905,64 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) { break; } + // Manually match MipsISD::Ins nodes to get the correct instruction. It has + // to be done in this fashion so that we respect the differences between + // dins and dinsm, as the difference is that the size operand has the range + // 0 < size <= 32 for dins while dinsm has the range 2 <= size <= 64 which + // means SelectionDAGISel would have to test all the operands at once to + // match the instruction. + case MipsISD::Ins: { + + // Sanity checking for the node operands. + if (Node->getValueType(0) != MVT::i32 && Node->getValueType(0) != MVT::i64) + return false; + + if (Node->getNumOperands() != 4) + return false; + + if (Node->getOperand(1)->getOpcode() != ISD::Constant || + Node->getOperand(2)->getOpcode() != ISD::Constant) + return false; + + MVT ResTy = Node->getSimpleValueType(0); + uint64_t Pos = Node->getConstantOperandVal(1); + uint64_t Size = Node->getConstantOperandVal(2); + + // Size has to be >0 for 'ins', 'dins' and 'dinsu'. 
+ if (!Size) + return false; + + if (Pos + Size > 64) + return false; + + if (ResTy != MVT::i32 && ResTy != MVT::i64) + return false; + + unsigned Opcode = 0; + if (ResTy == MVT::i32) { + if (Pos + Size <= 32) + Opcode = Mips::INS; + } else { + if (Pos + Size <= 32) + Opcode = Mips::DINS; + else if (Pos < 32 && 1 < Size) + Opcode = Mips::DINSM; + else + Opcode = Mips::DINSU; + } + + if (Opcode) { + SDValue Ops[4] = { + Node->getOperand(0), CurDAG->getTargetConstant(Pos, DL, MVT::i32), + CurDAG->getTargetConstant(Size, DL, MVT::i32), Node->getOperand(3)}; + + ReplaceNode(Node, CurDAG->getMachineNode(Opcode, DL, ResTy, Ops)); + return true; + } + + return false; + } + case MipsISD::ThreadPointer: { EVT PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout()); unsigned RdhwrOpc, DestReg; diff --git a/test/CodeGen/Mips/dins.ll b/test/CodeGen/Mips/dins.ll index 8a8b377861a..2f7138ca4c5 100644 --- a/test/CodeGen/Mips/dins.ll +++ b/test/CodeGen/Mips/dins.ll @@ -1,7 +1,11 @@ -; RUN: llc -O2 -march=mips64 -mcpu=mips64r2 -target-abi=n64 < %s -o - | FileCheck %s -check-prefix=MIPS64R2 -; RUN: llc -O2 -march=mips -mcpu=mips32r2 < %s -o - | FileCheck %s -check-prefix=MIPS32R2 -; RUN: llc -O2 -march=mips -mattr=mips16 < %s -o - | FileCheck %s -check-prefix=MIPS16 -; RUN: llc -O2 -march=mips64 -mcpu=mips64r2 -target-abi=n32 < %s -o - | FileCheck %s -check-prefix=MIPS64R2N32 +; RUN: llc -O2 -verify-machineinstrs -march=mips64 -mcpu=mips64r2 \ +; RUN: -target-abi=n64 < %s -o - | FileCheck %s -check-prefix=MIPS64R2 +; RUN: llc -O2 -verify-machineinstrs -march=mips -mcpu=mips32r2 < %s -o - \ +; RUN: | FileCheck %s -check-prefix=MIPS32R2 +; RUN: llc -O2 -verify-machineinstrs -march=mips -mattr=mips16 < %s -o - \ +; RUN: | FileCheck %s -check-prefix=MIPS16 +; RUN: llc -O2 -verify-machineinstrs -march=mips64 -mcpu=mips64r2 \ +; RUN: -target-abi=n32 < %s -o - | FileCheck %s -check-prefix=MIPS64R2N32 ; #include ; #include @@ -60,7 +64,7 @@ entry: ; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 123 ; MIPS64R2: dinsm $[[R0:[0-9]+]], $[[R1:[0-9]+]], 27, 37 ; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 4 -; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 28, 6 +; MIPS64R2: dinsm $[[R0:[0-9]+]], $[[R1:[0-9]+]], 28, 6 ; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 5 ; MIPS64R2: dinsu $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50, 14 ; MIPS64R2: dsrl $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50 -- cgit v1.2.3 From d16b502afd11c5c7f2883da31b63460eea106ae7 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 3 Nov 2017 16:17:13 +0000 Subject: [SLP] Test for PR23510, NFC. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317334 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../SLPVectorizer/X86/stores_vectorize.ll | 84 ++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 test/Transforms/SLPVectorizer/X86/stores_vectorize.ll diff --git a/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll b/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll new file mode 100644 index 00000000000..79fb782db8f --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll @@ -0,0 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s + +;void Distance(float *p1, int p2, unsigned long p3[], float p4[]) { +; long a = p3[0] = 5; +; p1 += p2; +; p4[3] += p1[a]; +; p3[0] >>= 5; +; p3[1] >>= 5; +; p3[2] >>= 5; +; p3[3] >>= 5; +; p1 += p2; +; p4[0] += p1[p3[0] & a]; +;} + +define void @_Z8DistanceIlLi5EEvPfiPmS0_(float* %p1, i32 %p2, i64* %p3, float* %p4) { +; CHECK-LABEL: @_Z8DistanceIlLi5EEvPfiPmS0_( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i64 5, i64* [[P3:%.*]], align 8 +; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[P2:%.*]] to i64 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[P1:%.*]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 5 +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[P4:%.*]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]] +; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[P3]], align 8 +; CHECK-NEXT: [[SHR:%.*]] = lshr i64 [[TMP2]], 5 +; CHECK-NEXT: store i64 [[SHR]], i64* [[P3]], align 8 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ARRAYIDX4]] to <2 x i64>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = lshr <2 x i64> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX4]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3 +; CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[ARRAYIDX8]], align 8 +; CHECK-NEXT: [[SHR9:%.*]] = lshr i64 [[TMP7]], 5 +; CHECK-NEXT: store i64 [[SHR9]], i64* [[ARRAYIDX8]], align 8 +; CHECK-NEXT: [[ADD_PTR11:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[AND:%.*]] = and i64 [[SHR]], 5 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[ADD_PTR11]], i64 [[AND]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[P4]], align 4 +; CHECK-NEXT: [[ADD15:%.*]] = fadd float [[TMP8]], [[TMP9]] +; CHECK-NEXT: store float [[ADD15]], float* [[P4]], align 4 +; CHECK-NEXT: ret void +; +entry: + store i64 5, i64* %p3, align 8 + %idx.ext = sext i32 %p2 to i64 + %add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext + %arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5 + %0 = load float, float* %arrayidx1, align 4 + 
%arrayidx2 = getelementptr inbounds float, float* %p4, i64 3
+  %1 = load float, float* %arrayidx2, align 4
+  %add = fadd float %0, %1
+  store float %add, float* %arrayidx2, align 4
+  %2 = load i64, i64* %p3, align 8
+  %shr = lshr i64 %2, 5
+  store i64 %shr, i64* %p3, align 8
+  %arrayidx4 = getelementptr inbounds i64, i64* %p3, i64 1
+  %3 = load i64, i64* %arrayidx4, align 8
+  %shr5 = lshr i64 %3, 5
+  store i64 %shr5, i64* %arrayidx4, align 8
+  %arrayidx6 = getelementptr inbounds i64, i64* %p3, i64 2
+  %4 = load i64, i64* %arrayidx6, align 8
+  %shr7 = lshr i64 %4, 5
+  store i64 %shr7, i64* %arrayidx6, align 8
+  %arrayidx8 = getelementptr inbounds i64, i64* %p3, i64 3
+  %5 = load i64, i64* %arrayidx8, align 8
+  %shr9 = lshr i64 %5, 5
+  store i64 %shr9, i64* %arrayidx8, align 8
+  %add.ptr11 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext
+  %and = and i64 %shr, 5
+  %arrayidx13 = getelementptr inbounds float, float* %add.ptr11, i64 %and
+  %6 = load float, float* %arrayidx13, align 4
+  %7 = load float, float* %p4, align 4
+  %add15 = fadd float %6, %7
+  store float %add15, float* %p4, align 4
+  ret void
+}
--
cgit v1.2.3


From 7c2eb4ec8b267bb3887787bf8e2afe800a72828a Mon Sep 17 00:00:00 2001
From: Jun Bum Lim
Date: Fri, 3 Nov 2017 16:24:53 +0000
Subject: [LICM] sink through non-trivially replacable PHI

Summary:
The current LICM allows sinking an instruction only when it is exposed to
exit blocks through a trivially replacable PHI of which all incoming values
are the same instruction. This change enhances LICM to sink a sinkable
instruction through non-trivially replacable PHIs by splitting predecessors
of loop exits.

Reviewers: hfinkel, majnemer, davidxl, bmakam, mcrosier, danielcdh, efriedma, jtony

Reviewed By: efriedma

Subscribers: nemanjai, dberlin, llvm-commits

Differential Revision: https://reviews.llvm.org/D37163

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317335 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LICM.cpp          | 196 +++++++++++++++-------
 test/CodeGen/PowerPC/subreg-postra-2.ll |   8 +-
 test/Transforms/LICM/sinking.ll         | 284 +++++++++++++++++++++++++++++++-
 3 files changed, 427 insertions(+), 61 deletions(-)

diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 6ca8d602302..c60ec9f50f7 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -62,6 +62,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -93,9 +94,8 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
 static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
                   const LoopSafetyInfo *SafetyInfo,
                   OptimizationRemarkEmitter *ORE);
-static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
-                 const Loop *CurLoop, AliasSetTracker *CurAST,
-                 const LoopSafetyInfo *SafetyInfo,
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+                 const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
                  OptimizationRemarkEmitter *ORE);
 static bool isSafeToExecuteUnconditionally(Instruction &Inst,
                                            const DominatorTree *DT,
@@ -394,8 +394,12 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
       //
       if (isNotUsedInLoop(I, CurLoop, SafetyInfo) &&
           canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) {
-        ++II;
-        Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo, ORE);
+        if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE)) {
+          ++II;
+          CurAST->deleteValue(&I);
+          I.eraseFromParent();
+          Changed = true;
+        }
       }
     }
   }
@@ -717,26 +721,6 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
       if (!BlockColors.empty() &&
          BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
        return false;
-
-      // A PHI node where all of the incoming values are this instruction are
-      // special -- they can just be RAUW'ed with the instruction and thus
-      // don't require a use in the predecessor. This is a particular important
-      // special case because it is the pattern found in LCSSA form.
-      if (isTriviallyReplacablePHI(*PN, I)) {
-        if (CurLoop->contains(PN))
-          return false;
-        else
-          continue;
-      }
-
-      // Otherwise, PHI node uses occur in predecessor blocks if the incoming
-      // values. Check for such a use being inside the loop.
-      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
-        if (PN->getIncomingValue(i) == &I)
-          if (CurLoop->contains(PN->getIncomingBlock(i)))
-            return false;
-
-      continue;
     }
 
     if (CurLoop->contains(UI))
@@ -806,14 +790,96 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
   return New;
 }
 
+static Instruction *sinkThroughTriviallyReplacablePHI(
+    PHINode *TPN, Instruction *I, LoopInfo *LI,
+    SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
+    const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop) {
+  assert(isTriviallyReplacablePHI(*TPN, *I) &&
+         "Expect only trivially replacable PHI");
+  BasicBlock *ExitBlock = TPN->getParent();
+  Instruction *New;
+  auto It = SunkCopies.find(ExitBlock);
+  if (It != SunkCopies.end())
+    New = It->second;
+  else
+    New = SunkCopies[ExitBlock] =
+        CloneInstructionInExitBlock(*I, *ExitBlock, *TPN, LI, SafetyInfo);
+  return New;
+}
+
+static bool canSplitPredecessors(PHINode *PN) {
+  BasicBlock *BB = PN->getParent();
+  if (!BB->canSplitPredecessors())
+    return false;
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+    BasicBlock *BBPred = *PI;
+    if (isa<IndirectBrInst>(BBPred->getTerminator()))
+      return false;
+  }
+  return true;
+}
+
+static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
+                                        LoopInfo *LI, const Loop *CurLoop) {
+#ifndef NDEBUG
+  SmallVector<BasicBlock *, 32> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+                                             ExitBlocks.end());
+#endif
+  BasicBlock *ExitBB = PN->getParent();
+  assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block.");
+
+  // Split predecessors of the loop exit so that instructions in the loop are
+  // exposed to exit blocks through trivially replacable PHIs while keeping the
+  // loop in the canonical form where each predecessor of each exit block should
+  // be contained within the loop. For example, this will convert the loop below
+  // from
+  //
+  // LB1:
+  //   %v1 =
+  //   br %LE, %LB2
+  // LB2:
+  //   %v2 =
+  //   br %LE, %LB1
+  // LE:
+  //   %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replacable
+  //
+  // to
+  //
+  // LB1:
+  //   %v1 =
+  //   br %LE.split, %LB2
+  // LB2:
+  //   %v2 =
+  //   br %LE.split2, %LB1
+  // LE.split:
+  //   %p1 = phi [%v1, %LB1] <-- trivially replacable
+  //   br %LE
+  // LE.split2:
+  //   %p2 = phi [%v2, %LB2] <-- trivially replacable
+  //   br %LE
+  // LE:
+  //   %p = phi [%p1, %LE.split], [%p2, %LE.split2]
+  //
+  SmallSetVector<BasicBlock *, 8> PredBBs(pred_begin(ExitBB), pred_end(ExitBB));
+  while (!PredBBs.empty()) {
+    BasicBlock *PredBB = *PredBBs.begin();
+    assert(CurLoop->contains(PredBB) &&
+           "Expect all predecessors are in the loop");
+    if (PN->getBasicBlockIndex(PredBB) >= 0)
+      SplitBlockPredecessors(ExitBB, PredBB, ".split.loop.exit", DT, LI, true);
+    PredBBs.remove(PredBB);
+  }
+}
+
 /// When an instruction is found to only be used outside of the loop, this
 /// function moves it to the exit blocks and patches up SSA form as needed.
 /// This method is guaranteed to remove the original instruction from its
 /// position, and may either delete it or move it to outside of the loop.
 ///
-static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
-                 const Loop *CurLoop, AliasSetTracker *CurAST,
-                 const LoopSafetyInfo *SafetyInfo,
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+                 const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
                  OptimizationRemarkEmitter *ORE) {
   DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
   ORE->emit([&]() {
@@ -828,57 +894,75 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
   ++NumSunk;
   Changed = true;
 
-#ifndef NDEBUG
-  SmallVector<BasicBlock *, 32> ExitBlocks;
-  CurLoop->getUniqueExitBlocks(ExitBlocks);
-  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
-                                             ExitBlocks.end());
-#endif
+  // Iterate over users to be ready for actual sinking. Replace users via
+  // unreachable blocks with undef and make all user PHIs trivially replacable.
+  SmallPtrSet<Instruction *, 8> VisitedUsers;
+  for (Value::user_iterator UI = I.user_begin(), UE = I.user_end(); UI != UE;) {
+    auto *User = cast<Instruction>(*UI);
+    Use &U = UI.getUse();
+    ++UI;
 
-  // Clones of this instruction. Don't create more than one per exit block!
-  SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+    if (VisitedUsers.count(User))
+      continue;
 
-  // If this instruction is only used outside of the loop, then all users are
-  // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
-  // the instruction.
-  while (!I.use_empty()) {
-    Value::user_iterator UI = I.user_begin();
-    auto *User = cast<Instruction>(*UI);
     if (!DT->isReachableFromEntry(User->getParent())) {
       User->replaceUsesOfWith(&I, UndefValue::get(I.getType()));
       continue;
     }
+
     // The user must be a PHI node.
     PHINode *PN = cast<PHINode>(User);
 
     // Surprisingly, instructions can be used outside of loops without any
    // exits. This can only happen in PHI nodes if the incoming block is
    // unreachable.
- Use &U = UI.getUse(); BasicBlock *BB = PN->getIncomingBlock(U); if (!DT->isReachableFromEntry(BB)) { U = UndefValue::get(I.getType()); continue; } - BasicBlock *ExitBlock = PN->getParent(); - assert(ExitBlockSet.count(ExitBlock) && - "The LCSSA PHI is not in an exit block!"); + VisitedUsers.insert(PN); + if (isTriviallyReplacablePHI(*PN, I)) + continue; - Instruction *New; - auto It = SunkCopies.find(ExitBlock); - if (It != SunkCopies.end()) - New = It->second; - else - New = SunkCopies[ExitBlock] = - CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI, SafetyInfo); + if (!canSplitPredecessors(PN)) + return false; + + // Split predecessors of the PHI so that we can make users trivially + // replacable. + splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop); + // Should rebuild the iterators, as they may be invalidated by + // splitPredecessorsOfLoopExit(). + UI = I.user_begin(); + UE = I.user_end(); + } + +#ifndef NDEBUG + SmallVector ExitBlocks; + CurLoop->getUniqueExitBlocks(ExitBlocks); + SmallPtrSet ExitBlockSet(ExitBlocks.begin(), + ExitBlocks.end()); +#endif + + // Clones of this instruction. Don't create more than one per exit block! + SmallDenseMap SunkCopies; + + // If this instruction is only used outside of the loop, then all users are + // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of + // the instruction. + while (!I.use_empty()) { + Value::user_iterator UI = I.user_begin(); + PHINode *PN = cast(*UI); + assert(ExitBlockSet.count(PN->getParent()) && + "The LCSSA PHI is not in an exit block!"); + // The PHI must be trivially replacable. + Instruction *New = sinkThroughTriviallyReplacablePHI(PN, &I, LI, SunkCopies, + SafetyInfo, CurLoop); PN->replaceAllUsesWith(New); PN->eraseFromParent(); } - - CurAST->deleteValue(&I); - I.eraseFromParent(); return Changed; } diff --git a/test/CodeGen/PowerPC/subreg-postra-2.ll b/test/CodeGen/PowerPC/subreg-postra-2.ll index 338000cd8ba..794c9c190d1 100644 --- a/test/CodeGen/PowerPC/subreg-postra-2.ll +++ b/test/CodeGen/PowerPC/subreg-postra-2.ll @@ -1,5 +1,5 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gep-opt=0 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false -ppc-gep-opt=0 < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -38,10 +38,10 @@ while.end418: ; preds = %wait_on_buffer.exit ; CHECK: stdcx. ; CHECK: isel {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, [[REG]] ; CHECK-NO-ISEL: bc 12, 20, [[TRUE:.LBB[0-9]+]] -; CHECK-NO-ISEL: ori 4, 7, 0 +; CHECK-NO-ISEL: ori 7, 8, 0 ; CHECK-NO-ISEL-NEXT: b [[SUCCESSOR:.LBB[0-9]+]] ; CHECK-NO-ISEL: [[TRUE]] -; CHECK-NO-ISEL-NEXT: addi 4, 3, 0 +; CHECK-NO-ISEL: addi 7, 3, 0 if.then420: ; preds = %while.end418 unreachable diff --git a/test/Transforms/LICM/sinking.ll b/test/Transforms/LICM/sinking.ll index 6e9e8d4b7b6..b28eea0bc2a 100644 --- a/test/Transforms/LICM/sinking.ll +++ b/test/Transforms/LICM/sinking.ll @@ -392,6 +392,288 @@ lab60: indirectbr i8* undef, [label %lab21, label %lab19] } -declare void @f(i32*) +; Check if LICM can sink a sinkable instruction the exit blocks through +; a non-trivially replacable PHI node. 
+;
+; CHECK-LABEL: @test14
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+; CHECK-NOT: sub
+;
+; CHECK-LABEL: Out12.split.loop.exit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop ]
+; CHECK: %[[MUL:.*]] = mul i32 %N, %[[LCSSAPHI]]
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12.split.loop.exit1:
+; CHECK: %[[LCSSAPHI2:.*]] = phi i32 [ %N_addr.0.pn, %Loop ]
+; CHECK: %[[MUL2:.*]] = mul i32 %N, %[[LCSSAPHI2]]
+; CHECK: %[[SUB:.*]] = sub i32 %[[MUL2]], %N
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12:
+; CHECK: phi i32 [ %[[MUL]], %Out12.split.loop.exit ], [ %[[SUB]], %Out12.split.loop.exit1 ]
+define i32 @test14(i32 %N, i32 %N2, i1 %C) {
+Entry:
+  br label %Loop
+Loop:
+  %N_addr.0.pn = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ]
+  %sink.mul = mul i32 %N, %N_addr.0.pn
+  %sink.sub = sub i32 %sink.mul, %N
+  %dec = add i32 %N_addr.0.pn, -1
+  br i1 %C, label %ContLoop, label %Out12
+ContLoop:
+  %tmp.1 = icmp ne i32 %N_addr.0.pn, 1
+  br i1 %tmp.1, label %Loop, label %Out12
+Out12:
+  %tmp = phi i32 [%sink.mul, %ContLoop], [%sink.sub, %Loop]
+  ret i32 %tmp
+}
+
+; In this test, splitting predecessors is not really required because the
+; operations of sinkable instructions (sub and mul) are the same. In this case,
+; we can sink the same sinkable operations and modify the PHI to pass the
+; operands to the shared operations. As of now, we split predecessors of
+; non-trivially replacable PHIs by default in LICM because all incoming edges
+; of a non-trivially replacable PHI in LCSSA are critical.
+;
+; CHECK-LABEL: @test15
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+; CHECK-NOT: sub
+;
+; CHECK-LABEL: Out12.split.loop.exit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop ]
+; CHECK: %[[MUL:.*]] = mul i32 %N, %[[LCSSAPHI]]
+; CHECK: %[[SUB:.*]] = sub i32 %[[MUL]], %N2
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12.split.loop.exit1:
+; CHECK: %[[LCSSAPHI2:.*]] = phi i32 [ %N_addr.0.pn, %Loop ]
+; CHECK: %[[MUL2:.*]] = mul i32 %N, %[[LCSSAPHI2]]
+; CHECK: %[[SUB2:.*]] = sub i32 %[[MUL2]], %N
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12:
+; CHECK: phi i32 [ %[[SUB]], %Out12.split.loop.exit ], [ %[[SUB2]], %Out12.split.loop.exit1 ]
define i32 @test15(i32 %N, i32 %N2, i1 %C) {
+Entry:
+  br label %Loop
+Loop:
+  %N_addr.0.pn = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ]
+  %sink.mul = mul i32 %N, %N_addr.0.pn
+  %sink.sub = sub i32 %sink.mul, %N
+  %sink.sub2 = sub i32 %sink.mul, %N2
+  %dec = add i32 %N_addr.0.pn, -1
+  br i1 %C, label %ContLoop, label %Out12
+ContLoop:
+  %tmp.1 = icmp ne i32 %N_addr.0.pn, 1
+  br i1 %tmp.1, label %Loop, label %Out12
+Out12:
+  %tmp = phi i32 [%sink.sub2, %ContLoop], [%sink.sub, %Loop]
+  ret i32 %tmp
+}
+
+; Sink through a non-trivially replacable PHI node which uses the same sinkable
+; instruction multiple times.
+; +; CHECK-LABEL: @test16 +; CHECK-LABEL: Loop: +; CHECK-NOT: mul +; +; CHECK-LABEL: Out.split.loop.exit: +; CHECK: %[[PHI:.*]] = phi i32 [ %l2, %ContLoop ] +; CHECK: br label %Out +; +; CHECK-LABEL: Out.split.loop.exit1: +; CHECK: %[[SINKABLE:.*]] = mul i32 %l2.lcssa, %t.le +; CHECK: br label %Out +; +; CHECK-LABEL: Out: +; CHECK: %idx = phi i32 [ %[[PHI]], %Out.split.loop.exit ], [ %[[SINKABLE]], %Out.split.loop.exit1 ] +define i32 @test16(i1 %c, i8** %P, i32* %P2, i64 %V) { +entry: + br label %loop.ph +loop.ph: + br label %Loop +Loop: + %iv = phi i64 [ 0, %loop.ph ], [ %next, %ContLoop ] + %l2 = call i32 @getv() + %t = trunc i64 %iv to i32 + %sinkable = mul i32 %l2, %t + switch i32 %l2, label %ContLoop [ + i32 32, label %Out + i32 46, label %Out + i32 95, label %Out + ] +ContLoop: + %next = add nuw i64 %iv, 1 + %c1 = call i1 @getc() + br i1 %c1, label %Loop, label %Out +Out: + %idx = phi i32 [ %l2, %ContLoop ], [ %sinkable, %Loop ], [ %sinkable, %Loop ], [ %sinkable, %Loop ] + ret i32 %idx +} + +; Sink a sinkable instruction through multiple non-trivially replacable PHIs in +; differect exit blocks. +; +; CHECK-LABEL: @test17 +; CHECK-LABEL: Loop: +; CHECK-NOT: mul +; +; CHECK-LABEL:OutA.split.loop.exit{{.*}}: +; CHECK: %[[OP1:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop1 ] +; CHECK: %[[SINKABLE:.*]] = mul i32 %N, %[[OP1]] +; CHECK: br label %OutA +; +; CHECK-LABEL:OutA: +; CHECK: phi i32{{.*}}[ %[[SINKABLE]], %OutA.split.loop.exit{{.*}} ] +; +; CHECK-LABEL:OutB.split.loop.exit{{.*}}: +; CHECK: %[[OP2:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop2 ] +; CHECK: %[[SINKABLE2:.*]] = mul i32 %N, %[[OP2]] +; CHECK: br label %OutB +; +; CHECK-LABEL:OutB: +; CHECK: phi i32 {{.*}}[ %[[SINKABLE2]], %OutB.split.loop.exit{{.*}} ] +define i32 @test17(i32 %N, i32 %N2) { +Entry: + br label %Loop +Loop: + %N_addr.0.pn = phi i32 [ %dec, %ContLoop3 ], [ %N, %Entry ] + %sink.mul = mul i32 %N, %N_addr.0.pn + %c0 = call i1 @getc() + br i1 %c0 , label %ContLoop1, label %OutA +ContLoop1: + %c1 = call i1 @getc() + br i1 %c1, label %ContLoop2, label %OutA + +ContLoop2: + %c2 = call i1 @getc() + br i1 %c2, label %ContLoop3, label %OutB +ContLoop3: + %c3 = call i1 @getc() + %dec = add i32 %N_addr.0.pn, -1 + br i1 %c3, label %Loop, label %OutB +OutA: + %tmp1 = phi i32 [%sink.mul, %ContLoop1], [%N2, %Loop] + br label %Out12 +OutB: + %tmp2 = phi i32 [%sink.mul, %ContLoop2], [%dec, %ContLoop3] + br label %Out12 +Out12: + %tmp = phi i32 [%tmp1, %OutA], [%tmp2, %OutB] + ret i32 %tmp +} + + +; Sink a sinkable instruction through both trivially and non-trivially replacable PHIs. 
+; +; CHECK-LABEL: @test18 +; CHECK-LABEL: Loop: +; CHECK-NOT: mul +; CHECK-NOT: sub +; +; CHECK-LABEL:Out12.split.loop.exit: +; CHECK: %[[OP:.*]] = phi i32 [ %iv, %ContLoop ] +; CHECK: %[[DEC:.*]] = phi i32 [ %dec, %ContLoop ] +; CHECK: %[[SINKMUL:.*]] = mul i32 %N, %[[OP]] +; CHECK: %[[SINKSUB:.*]] = sub i32 %[[SINKMUL]], %N2 +; CHECK: br label %Out12 +; +; CHECK-LABEL:Out12.split.loop.exit1: +; CHECK: %[[OP2:.*]] = phi i32 [ %iv, %Loop ] +; CHECK: %[[SINKMUL2:.*]] = mul i32 %N, %[[OP2]] +; CHECK: %[[SINKSUB2:.*]] = sub i32 %[[SINKMUL2]], %N2 +; CHECK: br label %Out12 +; +; CHECK-LABEL:Out12: +; CHECK: %tmp1 = phi i32 [ %[[SINKSUB]], %Out12.split.loop.exit ], [ %[[SINKSUB2]], %Out12.split.loop.exit1 ] +; CHECK: %tmp2 = phi i32 [ %[[DEC]], %Out12.split.loop.exit ], [ %[[SINKSUB2]], %Out12.split.loop.exit1 ] +; CHECK: %add = add i32 %tmp1, %tmp2 +define i32 @test18(i32 %N, i32 %N2) { +Entry: + br label %Loop +Loop: + %iv = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ] + %sink.mul = mul i32 %N, %iv + %sink.sub = sub i32 %sink.mul, %N2 + %c0 = call i1 @getc() + br i1 %c0, label %ContLoop, label %Out12 +ContLoop: + %dec = add i32 %iv, -1 + %c1 = call i1 @getc() + br i1 %c1, label %Loop, label %Out12 +Out12: + %tmp1 = phi i32 [%sink.sub, %ContLoop], [%sink.sub, %Loop] + %tmp2 = phi i32 [%dec, %ContLoop], [%sink.sub, %Loop] + %add = add i32 %tmp1, %tmp2 + ret i32 %add +} + +; Do not sink an instruction through a non-trivially replacable PHI, to avoid +; assert while splitting predecessors, if the terminator of predecessor is an +; indirectbr. +; CHECK-LABEL: @test19 +; CHECK-LABEL: L0: +; CHECK: %sinkable = mul +; CHECK: %sinkable2 = add + +define i32 @test19(i1 %cond, i1 %cond2, i8* %address, i32 %v1) nounwind { +entry: + br label %L0 +L0: + %indirect.goto.dest = select i1 %cond, i8* blockaddress(@test19, %exit), i8* %address + %v2 = call i32 @getv() + %sinkable = mul i32 %v1, %v2 + %sinkable2 = add i32 %v1, %v2 + indirectbr i8* %indirect.goto.dest, [label %L1, label %exit] + +L1: + %indirect.goto.dest2 = select i1 %cond2, i8* blockaddress(@test19, %exit), i8* %address + indirectbr i8* %indirect.goto.dest2, [label %L0, label %exit] + +exit: + %r = phi i32 [%sinkable, %L0], [%sinkable2, %L1] + ret i32 %r +} + +; Do not sink through a non-trivially replacable PHI if splitting predecessors +; not allowed in SplitBlockPredecessors(). +; +; CHECK-LABEL: @test20 +; CHECK-LABEL: while.cond +; CHECK: %sinkable = mul +; CHECK: %sinkable2 = add +define void @test20(i32* %s, i1 %b, i32 %v1, i32 %v2) personality i32 (...)* @__CxxFrameHandler3 { +entry: + br label %while.cond +while.cond: + %v = call i32 @getv() + %sinkable = mul i32 %v, %v2 + %sinkable2 = add i32 %v, %v2 + br i1 %b, label %try.cont, label %while.body +while.body: + invoke void @may_throw() + to label %while.body2 unwind label %catch.dispatch +while.body2: + invoke void @may_throw2() + to label %while.cond unwind label %catch.dispatch +catch.dispatch: + %.lcssa1 = phi i32 [ %sinkable, %while.body ], [ %sinkable2, %while.body2 ] + %cp = cleanuppad within none [] + store i32 %.lcssa1, i32* %s + cleanupret from %cp unwind to caller +try.cont: + ret void +} + +declare void @may_throw() +declare void @may_throw2() +declare i32 @__CxxFrameHandler3(...) 
+declare i32 @getv() +declare i1 @getc() +declare void @f(i32*) declare void @g() -- cgit v1.2.3 From 604f04f397ea185b505dcc4ea8cd16bce7ccbbea Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 3 Nov 2017 18:00:02 +0000 Subject: Invoke salvageDebugInfo from CodeGenPrepare's SinkCast() This preserves the debug info for the cast operation in the original location. rdar://problem/33460652 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317340 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/CodeGenPrepare.cpp | 1 + lib/Transforms/Utils/Local.cpp | 2 +- .../CodeGenPrepare/salvage-debug-info.ll | 118 +++++++++++++++++++++ 3 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 test/Transforms/CodeGenPrepare/salvage-debug-info.ll diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 973ddebd987..73f014704b8 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -1171,6 +1171,7 @@ static bool SinkCast(CastInst *CI) { // If we removed all uses, nuke the cast. if (CI->use_empty()) { + salvageDebugInfo(*CI); CI->eraseFromParent(); MadeChange = true; } diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 8c643c93ec4..cb7978f76aa 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -1366,7 +1366,7 @@ void llvm::salvageDebugInfo(Instruction &I) { return MetadataAsValue::get(I.getContext(), ValueAsMetadata::get(V)); }; - if (isa(&I)) { + if (isa(&I) || isa(&I)) { findDbgValues(DbgValues, &I); for (auto *DVI : DbgValues) { // Bitcasts are entirely irrelevant for debug info. Rewrite the dbg.value diff --git a/test/Transforms/CodeGenPrepare/salvage-debug-info.ll b/test/Transforms/CodeGenPrepare/salvage-debug-info.ll new file mode 100644 index 00000000000..5509b92a5c1 --- /dev/null +++ b/test/Transforms/CodeGenPrepare/salvage-debug-info.ll @@ -0,0 +1,118 @@ +; RUN: opt -codegenprepare -S %s -o - | FileCheck %s +; typedef struct info { +; unsigned long long size; +; } info_t; +; extern unsigned p; +; extern unsigned n; +; void f() { +; unsigned int i; +; if (p) { +; info_t *info = (info_t *)p; +; for (i = 0; i < n; i++) +; use(info[i].size); +; } +; } +source_filename = "debug.i" +target datalayout = "e-m:o-p:32:32-i64:64-a:0:32-n32-S128" +target triple = "thumbv7k-apple-ios10.0.0" + +%struct.info = type { i64 } + +@p = external local_unnamed_addr global i32, align 4 +@n = external local_unnamed_addr global i32, align 4 + +; Function Attrs: nounwind ssp uwtable +define void @f() local_unnamed_addr #0 !dbg !16 { +entry: + %0 = load i32, i32* @p, align 4, !dbg !25 + %tobool = icmp eq i32 %0, 0, !dbg !25 + br i1 %tobool, label %if.end, label %if.then, !dbg !26 + +if.then: ; preds = %entry + %1 = inttoptr i32 %0 to %struct.info*, !dbg !27 + tail call void @llvm.dbg.value(metadata %struct.info* %1, metadata !22, metadata !DIExpression()), !dbg !28 + ; CHECK: call void @llvm.dbg.value(metadata i32 %0, metadata !22, metadata !DIExpression()) + tail call void @llvm.dbg.value(metadata i32 0, metadata !20, metadata !DIExpression()), !dbg !29 + %2 = load i32, i32* @n, align 4, !dbg !30 + %cmp5 = icmp eq i32 %2, 0, !dbg !33 + br i1 %cmp5, label %if.end, label %for.body.preheader, !dbg !34 + +for.body.preheader: ; preds = %if.then + ; CHECK: for.body.preheader: + ; CHECK: %2 = inttoptr i32 %0 to %struct.info* + br label %for.body, !dbg !35 + +for.body: ; preds = %for.body.preheader, %for.body + %lsr.iv = phi %struct.info* [ %1, %for.body.preheader ], [ %scevgep, %for.body ] 
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %lsr.iv7 = bitcast %struct.info* %lsr.iv to i64* + tail call void @llvm.dbg.value(metadata i32 %i.06, metadata !20, metadata !DIExpression()), !dbg !29 + %3 = load i64, i64* %lsr.iv7, align 8, !dbg !35 + %call = tail call i32 bitcast (i32 (...)* @use to i32 (i64)*)(i64 %3) #3, !dbg !36 + %inc = add nuw i32 %i.06, 1, !dbg !37 + tail call void @llvm.dbg.value(metadata i32 %inc, metadata !20, metadata !DIExpression()), !dbg !29 + %4 = load i32, i32* @n, align 4, !dbg !30 + %scevgep = getelementptr %struct.info, %struct.info* %lsr.iv, i32 1, !dbg !33 + %cmp = icmp ult i32 %inc, %4, !dbg !33 + br i1 %cmp, label %for.body, label %if.end.loopexit, !dbg !34, !llvm.loop !38 + +if.end.loopexit: ; preds = %for.body + br label %if.end, !dbg !40 + +if.end: ; preds = %if.end.loopexit, %if.then, %entry + ret void, !dbg !40 +} +declare i32 @use(...) local_unnamed_addr #1 + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) #2 + +attributes #0 = { nounwind ssp uwtable } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { nobuiltin nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!10, !11, !12, !13, !14} +!llvm.ident = !{!15} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0 (trunk 317231) (llvm/trunk 317262)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3) +!1 = !DIFile(filename: "debug.i", directory: "/Data/radar/35321562") +!2 = !{} +!3 = !{!4} +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 32) +!5 = !DIDerivedType(tag: DW_TAG_typedef, name: "info_t", file: !1, line: 3, baseType: !6) +!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "info", file: !1, line: 1, size: 64, elements: !7) +!7 = !{!8} +!8 = !DIDerivedType(tag: DW_TAG_member, name: "size", scope: !6, file: !1, line: 2, baseType: !9, size: 64) +!9 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned) +!10 = !{i32 2, !"Dwarf Version", i32 4} +!11 = !{i32 2, !"Debug Info Version", i32 3} +!12 = !{i32 1, !"wchar_size", i32 4} +!13 = !{i32 1, !"min_enum_size", i32 4} +!14 = !{i32 7, !"PIC Level", i32 2} +!15 = !{!"clang version 6.0.0 (trunk 317231) (llvm/trunk 317262)"} +!16 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 6, type: !17, isLocal: false, isDefinition: true, scopeLine: 6, isOptimized: true, unit: !0, variables: !19) +!17 = !DISubroutineType(types: !18) +!18 = !{null} +!19 = !{!20, !22} +!20 = !DILocalVariable(name: "i", scope: !16, file: !1, line: 7, type: !21) +!21 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) +!22 = !DILocalVariable(name: "info", scope: !23, file: !1, line: 9, type: !4) +!23 = distinct !DILexicalBlock(scope: !24, file: !1, line: 8, column: 10) +!24 = distinct !DILexicalBlock(scope: !16, file: !1, line: 8, column: 7) +!25 = !DILocation(line: 8, column: 7, scope: !24) +!26 = !DILocation(line: 8, column: 7, scope: !16) +!27 = !DILocation(line: 9, column: 20, scope: !23) +!28 = !DILocation(line: 9, column: 13, scope: !23) +!29 = !DILocation(line: 7, column: 16, scope: !16) +!30 = !DILocation(line: 10, column: 21, scope: !31) +!31 = distinct !DILexicalBlock(scope: !32, file: !1, line: 10, column: 5) +!32 = distinct !DILexicalBlock(scope: !23, file: !1, line: 10, column: 5) +!33 = !DILocation(line: 10, column: 19, scope: !31) +!34 = !DILocation(line: 10, column: 5, scope: 
!32) +!35 = !DILocation(line: 11, column: 19, scope: !31) +!36 = !DILocation(line: 11, column: 7, scope: !31) +!37 = !DILocation(line: 10, column: 25, scope: !31) +!38 = distinct !{!38, !34, !39} +!39 = !DILocation(line: 11, column: 23, scope: !32) +!40 = !DILocation(line: 13, column: 1, scope: !16) -- cgit v1.2.3 From 761cb9cc0a2d5422dd22e2a68bbbbc7d374d8247 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Nov 2017 18:02:44 +0000 Subject: [X86] Initialize Type and Subtype in getHostCPUName to 0. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317341 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Support/Host.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index c167df5a444..40ed87bf40d 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -1057,8 +1057,8 @@ StringRef sys::getHostCPUName() { detectX86FamilyModel(EAX, &Family, &Model); getAvailableFeatures(ECX, EDX, MaxLeaf, &Features, &Features2); - unsigned Type; - unsigned Subtype; + unsigned Type = 0; + unsigned Subtype = 0; if (Vendor == SIG_INTEL) { getIntelProcessorTypeAndSubtype(Family, Model, Brand_id, Features, -- cgit v1.2.3 From aaf1db11f9e3b32446153ce847093dd24fdf8f65 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Nov 2017 18:02:46 +0000 Subject: [CodeGen] Remove unnecessary semicolons to fix a warning. NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317342 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MIRCanonicalizerPass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/CodeGen/MIRCanonicalizerPass.cpp b/lib/CodeGen/MIRCanonicalizerPass.cpp index 61f9f7e2c5d..09b3a8774cb 100644 --- a/lib/CodeGen/MIRCanonicalizerPass.cpp +++ b/lib/CodeGen/MIRCanonicalizerPass.cpp @@ -101,10 +101,10 @@ char MIRCanonicalizer::ID; char &llvm::MIRCanonicalizerID = MIRCanonicalizer::ID; INITIALIZE_PASS_BEGIN(MIRCanonicalizer, "mir-canonicalizer", - "Rename Register Operands Canonically", false, false); + "Rename Register Operands Canonically", false, false) INITIALIZE_PASS_END(MIRCanonicalizer, "mir-canonicalizer", - "Rename Register Operands Canonically", false, false); + "Rename Register Operands Canonically", false, false) static std::vector GetRPOList(MachineFunction &MF) { ReversePostOrderTraversal RPOT(&*MF.begin()); -- cgit v1.2.3 From 6a8da4f6feecd43764872f3e52a9db813491d266 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 3 Nov 2017 18:26:36 +0000 Subject: Revert "Invoke salvageDebugInfo from CodeGenPrepare's SinkCast()" This reverts commit 317342 while investigating bot breakage. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317345 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/CodeGenPrepare.cpp | 1 - lib/Transforms/Utils/Local.cpp | 2 +- .../CodeGenPrepare/salvage-debug-info.ll | 118 --------------------- 3 files changed, 1 insertion(+), 120 deletions(-) delete mode 100644 test/Transforms/CodeGenPrepare/salvage-debug-info.ll diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 73f014704b8..973ddebd987 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -1171,7 +1171,6 @@ static bool SinkCast(CastInst *CI) { // If we removed all uses, nuke the cast. 
if (CI->use_empty()) { - salvageDebugInfo(*CI); CI->eraseFromParent(); MadeChange = true; } diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index cb7978f76aa..8c643c93ec4 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -1366,7 +1366,7 @@ void llvm::salvageDebugInfo(Instruction &I) { return MetadataAsValue::get(I.getContext(), ValueAsMetadata::get(V)); }; - if (isa(&I) || isa(&I)) { + if (isa(&I)) { findDbgValues(DbgValues, &I); for (auto *DVI : DbgValues) { // Bitcasts are entirely irrelevant for debug info. Rewrite the dbg.value diff --git a/test/Transforms/CodeGenPrepare/salvage-debug-info.ll b/test/Transforms/CodeGenPrepare/salvage-debug-info.ll deleted file mode 100644 index 5509b92a5c1..00000000000 --- a/test/Transforms/CodeGenPrepare/salvage-debug-info.ll +++ /dev/null @@ -1,118 +0,0 @@ -; RUN: opt -codegenprepare -S %s -o - | FileCheck %s -; typedef struct info { -; unsigned long long size; -; } info_t; -; extern unsigned p; -; extern unsigned n; -; void f() { -; unsigned int i; -; if (p) { -; info_t *info = (info_t *)p; -; for (i = 0; i < n; i++) -; use(info[i].size); -; } -; } -source_filename = "debug.i" -target datalayout = "e-m:o-p:32:32-i64:64-a:0:32-n32-S128" -target triple = "thumbv7k-apple-ios10.0.0" - -%struct.info = type { i64 } - -@p = external local_unnamed_addr global i32, align 4 -@n = external local_unnamed_addr global i32, align 4 - -; Function Attrs: nounwind ssp uwtable -define void @f() local_unnamed_addr #0 !dbg !16 { -entry: - %0 = load i32, i32* @p, align 4, !dbg !25 - %tobool = icmp eq i32 %0, 0, !dbg !25 - br i1 %tobool, label %if.end, label %if.then, !dbg !26 - -if.then: ; preds = %entry - %1 = inttoptr i32 %0 to %struct.info*, !dbg !27 - tail call void @llvm.dbg.value(metadata %struct.info* %1, metadata !22, metadata !DIExpression()), !dbg !28 - ; CHECK: call void @llvm.dbg.value(metadata i32 %0, metadata !22, metadata !DIExpression()) - tail call void @llvm.dbg.value(metadata i32 0, metadata !20, metadata !DIExpression()), !dbg !29 - %2 = load i32, i32* @n, align 4, !dbg !30 - %cmp5 = icmp eq i32 %2, 0, !dbg !33 - br i1 %cmp5, label %if.end, label %for.body.preheader, !dbg !34 - -for.body.preheader: ; preds = %if.then - ; CHECK: for.body.preheader: - ; CHECK: %2 = inttoptr i32 %0 to %struct.info* - br label %for.body, !dbg !35 - -for.body: ; preds = %for.body.preheader, %for.body - %lsr.iv = phi %struct.info* [ %1, %for.body.preheader ], [ %scevgep, %for.body ] - %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %lsr.iv7 = bitcast %struct.info* %lsr.iv to i64* - tail call void @llvm.dbg.value(metadata i32 %i.06, metadata !20, metadata !DIExpression()), !dbg !29 - %3 = load i64, i64* %lsr.iv7, align 8, !dbg !35 - %call = tail call i32 bitcast (i32 (...)* @use to i32 (i64)*)(i64 %3) #3, !dbg !36 - %inc = add nuw i32 %i.06, 1, !dbg !37 - tail call void @llvm.dbg.value(metadata i32 %inc, metadata !20, metadata !DIExpression()), !dbg !29 - %4 = load i32, i32* @n, align 4, !dbg !30 - %scevgep = getelementptr %struct.info, %struct.info* %lsr.iv, i32 1, !dbg !33 - %cmp = icmp ult i32 %inc, %4, !dbg !33 - br i1 %cmp, label %for.body, label %if.end.loopexit, !dbg !34, !llvm.loop !38 - -if.end.loopexit: ; preds = %for.body - br label %if.end, !dbg !40 - -if.end: ; preds = %if.end.loopexit, %if.then, %entry - ret void, !dbg !40 -} -declare i32 @use(...) 
local_unnamed_addr #1 - -; Function Attrs: nounwind readnone speculatable -declare void @llvm.dbg.value(metadata, metadata, metadata) #2 - -attributes #0 = { nounwind ssp uwtable } -attributes #2 = { nounwind readnone speculatable } -attributes #3 = { nobuiltin nounwind } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!10, !11, !12, !13, !14} -!llvm.ident = !{!15} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0 (trunk 317231) (llvm/trunk 317262)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3) -!1 = !DIFile(filename: "debug.i", directory: "/Data/radar/35321562") -!2 = !{} -!3 = !{!4} -!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 32) -!5 = !DIDerivedType(tag: DW_TAG_typedef, name: "info_t", file: !1, line: 3, baseType: !6) -!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "info", file: !1, line: 1, size: 64, elements: !7) -!7 = !{!8} -!8 = !DIDerivedType(tag: DW_TAG_member, name: "size", scope: !6, file: !1, line: 2, baseType: !9, size: 64) -!9 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned) -!10 = !{i32 2, !"Dwarf Version", i32 4} -!11 = !{i32 2, !"Debug Info Version", i32 3} -!12 = !{i32 1, !"wchar_size", i32 4} -!13 = !{i32 1, !"min_enum_size", i32 4} -!14 = !{i32 7, !"PIC Level", i32 2} -!15 = !{!"clang version 6.0.0 (trunk 317231) (llvm/trunk 317262)"} -!16 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 6, type: !17, isLocal: false, isDefinition: true, scopeLine: 6, isOptimized: true, unit: !0, variables: !19) -!17 = !DISubroutineType(types: !18) -!18 = !{null} -!19 = !{!20, !22} -!20 = !DILocalVariable(name: "i", scope: !16, file: !1, line: 7, type: !21) -!21 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) -!22 = !DILocalVariable(name: "info", scope: !23, file: !1, line: 9, type: !4) -!23 = distinct !DILexicalBlock(scope: !24, file: !1, line: 8, column: 10) -!24 = distinct !DILexicalBlock(scope: !16, file: !1, line: 8, column: 7) -!25 = !DILocation(line: 8, column: 7, scope: !24) -!26 = !DILocation(line: 8, column: 7, scope: !16) -!27 = !DILocation(line: 9, column: 20, scope: !23) -!28 = !DILocation(line: 9, column: 13, scope: !23) -!29 = !DILocation(line: 7, column: 16, scope: !16) -!30 = !DILocation(line: 10, column: 21, scope: !31) -!31 = distinct !DILexicalBlock(scope: !32, file: !1, line: 10, column: 5) -!32 = distinct !DILexicalBlock(scope: !23, file: !1, line: 10, column: 5) -!33 = !DILocation(line: 10, column: 19, scope: !31) -!34 = !DILocation(line: 10, column: 5, scope: !32) -!35 = !DILocation(line: 11, column: 19, scope: !31) -!36 = !DILocation(line: 11, column: 7, scope: !31) -!37 = !DILocation(line: 10, column: 25, scope: !31) -!38 = distinct !{!38, !34, !39} -!39 = !DILocation(line: 11, column: 23, scope: !32) -!40 = !DILocation(line: 13, column: 1, scope: !16) -- cgit v1.2.3 From aba0da108e9400f8cd31655e241d7d6af5f43abe Mon Sep 17 00:00:00 2001 From: Evgeny Stupachenko Date: Fri, 3 Nov 2017 18:50:03 +0000 Subject: The patch fixes PR35131 Summary: Fix a misprint which led to false CTLZ recognition. 
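To see why the misprint mattered: in the old code the null-check lived inside the if that declared the cast result, so a failed dyn_cast skipped the check entirely and a variable shift amount was accepted as a CTLZ idiom. A minimal standalone reproduction of the pattern (hypothetical stand-in types, not the LLVM API):

    #include <cassert>

    struct Value { bool IsConstant; bool IsOne; };

    // Stand-in for dyn_cast: null when V is not a constant.
    static Value *dynCastConstant(Value *V) { return V->IsConstant ? V : nullptr; }

    // Old shape: the inner check only runs when the cast succeeds, so a
    // variable shift amount falls through and is wrongly accepted.
    static bool acceptsShiftOld(Value *V) {
      if (Value *Shft = dynCastConstant(V))
        if (!Shft || !Shft->IsOne)
          return false; // the !Shft half of this test is dead code here
      return true;
    }

    // Fixed shape: a failed cast now rejects the idiom.
    static bool acceptsShiftNew(Value *V) {
      Value *Shft = dynCastConstant(V);
      if (!Shft || !Shft->IsOne)
        return false;
      return true;
    }

    int main() {
      Value VarShift{false, false};        // shift amount is not a constant
      assert(acceptsShiftOld(&VarShift));  // bug: accepted
      assert(!acceptsShiftNew(&VarShift)); // fix: rejected
      return 0;
    }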
Reviewers: craig.topper

Differential Revision: https://reviews.llvm.org/D39585

From: Evgeny Stupachenko

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317348 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 413fb75d172..eb5f3cc47ce 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1326,9 +1326,9 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
   // step 2: detect instructions corresponding to "x.next = x >> 1"
   if (!DefX || DefX->getOpcode() != Instruction::AShr)
     return false;
-  if (ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)))
-    if (!Shft || !Shft->isOne())
-      return false;
+  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
+  if (!Shft || !Shft->isOne())
+    return false;
   VarX = DefX->getOperand(0);

   // step 3: Check the recurrence of variable X
--
cgit v1.2.3


From 8f805056c27cc02d22eb0717d4af9d00e25b9c31 Mon Sep 17 00:00:00 2001
From: Evandro Menezes
Date: Fri, 3 Nov 2017 18:56:36 +0000
Subject: [AArch64] Fix the number of iterations for the Newton series

The number of iterations was incorrectly determined for DP FP vector types
and the tests were insufficient to flag this issue.

Differential revision: https://reviews.llvm.org/D39507

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317349 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelLowering.cpp |  2 +-
 test/CodeGen/AArch64/recp-fastmath.ll      | 34 +++++++++---
 test/CodeGen/AArch64/sqrt-fastmath.ll      | 83 ++++++++++++++++++++++++------
 3 files changed, 94 insertions(+), 25 deletions(-)

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index bec872ae8c0..aabbaf90f68 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4981,7 +4981,7 @@ static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
     // the initial estimate is 2^-8. Thus the number of extra steps to refine
     // the result for float (23 mantissa bits) is 2 and for double (52
     // mantissa bits) is 3.
-    ExtraSteps = VT == MVT::f64 ? 3 : 2;
+    ExtraSteps = VT.getScalarType() == MVT::f64 ?
3 : 2; return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); } diff --git a/test/CodeGen/AArch64/recp-fastmath.ll b/test/CodeGen/AArch64/recp-fastmath.ll index 38e0fb360e4..4776931cf06 100644 --- a/test/CodeGen/AArch64/recp-fastmath.ll +++ b/test/CodeGen/AArch64/recp-fastmath.ll @@ -18,6 +18,8 @@ define float @frecp1(float %x) #1 { ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frecpe [[R:s[0-7]]] ; CHECK-NEXT: frecps {{s[0-7](, s[0-7])?}}, [[R]] +; CHECK: frecps {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} +; CHECK-NOT: frecps {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} } define <2 x float> @f2recp0(<2 x float> %x) #0 { @@ -38,6 +40,8 @@ define <2 x float> @f2recp1(<2 x float> %x) #1 { ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frecpe [[R:v[0-7]\.2s]] ; CHECK-NEXT: frecps {{v[0-7]\.2s(, v[0-7].2s)?}}, [[R]] +; CHECK: frecps {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} +; CHECK-NOT: frecps {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} } define <4 x float> @f4recp0(<4 x float> %x) #0 { @@ -58,6 +62,8 @@ define <4 x float> @f4recp1(<4 x float> %x) #1 { ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frecpe [[R:v[0-7]\.4s]] ; CHECK-NEXT: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[R]] +; CHECK: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} } define <8 x float> @f8recp0(<8 x float> %x) #0 { @@ -77,10 +83,12 @@ define <8 x float> @f8recp1(<8 x float> %x) #1 { ; CHECK-LABEL: f8recp1: ; CHECK-NEXT: BB#0 -; CHECK-NEXT: frecpe [[RA:v[0-7]\.4s]] -; CHECK-NEXT: frecpe [[RB:v[0-7]\.4s]] -; CHECK-NEXT: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[RA]] -; CHECK: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[RB]] +; CHECK-NEXT: frecpe [[R:v[0-7]\.4s]] +; CHECK: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[R]] +; CHECK: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, {{v[0-7]\.4s}} +; CHECK: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} } define double @drecp0(double %x) #0 { @@ -101,6 +109,9 @@ define double @drecp1(double %x) #1 { ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frecpe [[R:d[0-7]]] ; CHECK-NEXT: frecps {{d[0-7](, d[0-7])?}}, [[R]] +; CHECK: frecps {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK: frecps {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK-NOT: frecps {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} } define <2 x double> @d2recp0(<2 x double> %x) #0 { @@ -121,6 +132,9 @@ define <2 x double> @d2recp1(<2 x double> %x) #1 { ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frecpe [[R:v[0-7]\.2d]] ; CHECK-NEXT: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[R]] +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} } define <4 x double> @d4recp0(<4 x double> %x) #0 { @@ -140,10 +154,14 @@ define <4 x double> @d4recp1(<4 x double> %x) #1 { ; CHECK-LABEL: d4recp1: ; CHECK-NEXT: BB#0 -; CHECK-NEXT: frecpe [[RA:v[0-7]\.2d]] -; CHECK-NEXT: frecpe [[RB:v[0-7]\.2d]] -; CHECK-NEXT: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[RA]] -; CHECK: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[RB]] +; CHECK-NEXT: frecpe [[R:v[0-7]\.2d]] +; CHECK: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[R]] +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; 
CHECK-NOT: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} } attributes #0 = { nounwind "unsafe-fp-math"="true" } diff --git a/test/CodeGen/AArch64/sqrt-fastmath.ll b/test/CodeGen/AArch64/sqrt-fastmath.ll index 079562c0581..4dd0516faf0 100644 --- a/test/CodeGen/AArch64/sqrt-fastmath.ll +++ b/test/CodeGen/AArch64/sqrt-fastmath.ll @@ -22,7 +22,9 @@ define float @fsqrt(float %a) #0 { ; CHECK-NEXT: frsqrte [[RA:s[0-7]]] ; CHECK-NEXT: fmul [[RB:s[0-7]]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{s[0-7](, s[0-7])?}}, [[RB]] -; CHECK: fcmp s0, #0 +; CHECK: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} +; CHECK-NOT: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} +; CHECK: fcmp {{s[0-7]}}, #0 } define <2 x float> @f2sqrt(<2 x float> %a) #0 { @@ -38,7 +40,9 @@ define <2 x float> @f2sqrt(<2 x float> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2s]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.2s]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.2s(, v[0-7]\.2s)?}}, [[RB]] -; CHECK: fcmeq {{v[0-7]\.2s, v0\.2s}}, #0 +; CHECK: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} +; CHECK-NOT: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} +; CHECK: fcmeq {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, #0 } define <4 x float> @f4sqrt(<4 x float> %a) #0 { @@ -54,7 +58,9 @@ define <4 x float> @f4sqrt(<4 x float> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]] -; CHECK: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0 +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0 } define <8 x float> @f8sqrt(<8 x float> %a) #0 { @@ -69,9 +75,16 @@ define <8 x float> @f8sqrt(<8 x float> %a) #0 { ; CHECK-LABEL: f8sqrt: ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]] -; CHECK: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]] -; CHECK: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]] -; CHECK: fcmeq {{v[0-7]\.4s, v[0-1]\.4s}}, #0 +; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]] +; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]] +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0 +; CHECK: frsqrte [[RC:v[0-7]\.4s]] +; CHECK-NEXT: fmul [[RD:v[0-7]\.4s]], [[RC]], [[RC]] +; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RD]] +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0 } define double @dsqrt(double %a) #0 { @@ -87,7 +100,10 @@ define double @dsqrt(double %a) #0 { ; CHECK-NEXT: frsqrte [[RA:d[0-7]]] ; CHECK-NEXT: fmul [[RB:d[0-7]]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{d[0-7](, d[0-7])?}}, [[RB]] -; CHECK: fcmp d0, #0 +; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK-NOT: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK: fcmp {{d[0-7]}}, #0 } define <2 x double> @d2sqrt(<2 x double> %a) #0 { @@ -103,7 +119,10 @@ define <2 x double> @d2sqrt(<2 x double> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]] -; CHECK: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0 +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, 
{{v[0-7]\.2d}} +; CHECK: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0 } define <4 x double> @d4sqrt(<4 x double> %a) #0 { @@ -118,9 +137,19 @@ define <4 x double> @d4sqrt(<4 x double> %a) #0 { ; CHECK-LABEL: d4sqrt: ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]] -; CHECK: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]] -; CHECK: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]] -; CHECK: fcmeq {{v[0-7]\.2d, v[0-1]\.2d}}, #0 +; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]] +; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]] +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0 +; CHECK: frsqrte [[RC:v[0-7]\.2d]] +; CHECK-NEXT: fmul [[RD:v[0-7]\.2d]], [[RC]], [[RC]] +; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RD]] +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0 } define float @frsqrt(float %a) #0 { @@ -137,6 +166,8 @@ define float @frsqrt(float %a) #0 { ; CHECK-NEXT: frsqrte [[RA:s[0-7]]] ; CHECK-NEXT: fmul [[RB:s[0-7]]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{s[0-7](, s[0-7])?}}, [[RB]] +; CHECK: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} +; CHECK-NOT: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} ; CHECK-NOT: fcmp {{s[0-7]}}, #0 } @@ -154,7 +185,9 @@ define <2 x float> @f2rsqrt(<2 x float> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2s]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.2s]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.2s(, v[0-7]\.2s)?}}, [[RB]] -; CHECK-NOT: fcmeq {{v[0-7]\.2s, v0\.2s}}, #0 +; CHECK: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} +; CHECK-NOT: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} +; CHECK-NOT: fcmeq {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, #0 } define <4 x float> @f4rsqrt(<4 x float> %a) #0 { @@ -171,7 +204,9 @@ define <4 x float> @f4rsqrt(<4 x float> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]] -; CHECK-NOT: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0 +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0 } define <8 x float> @f8rsqrt(<8 x float> %a) #0 { @@ -189,7 +224,11 @@ define <8 x float> @f8rsqrt(<8 x float> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]] ; CHECK: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]] ; CHECK: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]] -; CHECK-NOT: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0 +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0 } define double @drsqrt(double %a) #0 { @@ -206,6 +245,9 @@ define double @drsqrt(double %a) #0 { ; CHECK-NEXT: frsqrte [[RA:d[0-7]]] ; CHECK-NEXT: fmul [[RB:d[0-7]]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{d[0-7](, d[0-7])?}}, [[RB]] +; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK-NOT: frsqrts {{d[0-7]}}, {{d[0-7]}}, 
{{d[0-7]}} ; CHECK-NOT: fcmp d0, #0 } @@ -223,7 +265,10 @@ define <2 x double> @d2rsqrt(<2 x double> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]] -; CHECK-NOT: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0 +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0 } define <4 x double> @d4rsqrt(<4 x double> %a) #0 { @@ -241,7 +286,13 @@ define <4 x double> @d4rsqrt(<4 x double> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]] ; CHECK: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]] ; CHECK: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]] -; CHECK-NOT: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0 +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0 } attributes #0 = { nounwind "unsafe-fp-math"="true" } -- cgit v1.2.3 From a8631b87aef95da6cd44dd94508c4f37c26b4867 Mon Sep 17 00:00:00 2001 From: Jake Ehrlich Date: Fri, 3 Nov 2017 18:58:41 +0000 Subject: [llvm-objcopy] Add support for dwarf fission This change adds support for dwarf fission. Differential Revision: https://reviews.llvm.org/D39207 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317350 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-objcopy/Inputs/dwarf.dwo | Bin 0 -> 3568 bytes test/tools/llvm-objcopy/drawf-fission.test | 43 +++++++++++++++ tools/llvm-objcopy/Object.h | 1 + tools/llvm-objcopy/llvm-objcopy.cpp | 81 +++++++++++++++++++++++------ 4 files changed, 110 insertions(+), 15 deletions(-) create mode 100644 test/tools/llvm-objcopy/Inputs/dwarf.dwo create mode 100644 test/tools/llvm-objcopy/drawf-fission.test diff --git a/test/tools/llvm-objcopy/Inputs/dwarf.dwo b/test/tools/llvm-objcopy/Inputs/dwarf.dwo new file mode 100644 index 00000000000..4b6fd505506 Binary files /dev/null and b/test/tools/llvm-objcopy/Inputs/dwarf.dwo differ diff --git a/test/tools/llvm-objcopy/drawf-fission.test b/test/tools/llvm-objcopy/drawf-fission.test new file mode 100644 index 00000000000..112bffbc891 --- /dev/null +++ b/test/tools/llvm-objcopy/drawf-fission.test @@ -0,0 +1,43 @@ +# RUN: llvm-objcopy -extract-dwo %p/Inputs/dwarf.dwo %t +# RUN: llvm-objcopy -strip-dwo %p/Inputs/dwarf.dwo %t2 +# RUN: llvm-objcopy -split-dwo=%t3 %p/Inputs/dwarf.dwo %t4 +# RUN: llvm-readobj -file-headers -sections %t | FileCheck %s -check-prefix=DWARF +# RUN: llvm-readobj -file-headers -sections %t2 | FileCheck %s -check-prefix=STRIP +# RUN: diff %t %t3 +# RUN: diff %t2 %t4 + +#DWARF: SectionHeaderCount: 8 + +#DWARF: Name: .debug_loc.dwo +#DWARF: Name: .debug_str.dwo +#DWARF: Name: .debug_str_offsets.dwo +#DWARF: Name: .debug_info.dwo +#DWARF: Name: .debug_abbrev.dwo +#DWARF: Name: .debug_line.dwo +#DWARF: Name: .strtab + +#STRIP: SectionHeaderCount: 24 + +#STRIP: Name: .text +#STRIP: Name: .rodata.str1.1 +#STRIP: Name: .debug_str +#STRIP: Name: .debug_abbrev +#STRIP: Name: .debug_info +#STRIP: Name: .debug_ranges +#STRIP: Name: .debug_macinfo +#STRIP: Name: .debug_addr +#STRIP: Name: 
.debug_pubnames
+#STRIP: Name: .debug_pubtypes
+#STRIP: Name: .comment
+#STRIP: Name: .note.GNU-stack
+#STRIP: Name: .debug_frame
+#STRIP: Name: .debug_line
+#STRIP: Name: .symtab
+#STRIP: Name: .rela.text
+#STRIP: Name: .rela.debug_info
+#STRIP: Name: .rela.debug_addr
+#STRIP: Name: .rela.debug_pubnames
+#STRIP: Name: .rela.debug_pubtypes
+#STRIP: Name: .rela.debug_frame
+#STRIP: Name: .rela.debug_line
+#STRIP: Name: .strtab
diff --git a/tools/llvm-objcopy/Object.h b/tools/llvm-objcopy/Object.h
index 9c77f5900ce..f12e6da7d21 100644
--- a/tools/llvm-objcopy/Object.h
+++ b/tools/llvm-objcopy/Object.h
@@ -368,6 +368,7 @@ public:
   Object(const object::ELFObjectFile<ELFT> &Obj);
   virtual ~Object() = default;

+  const SectionBase *getSectionHeaderStrTab() const { return SectionNames; }
   void removeSections(std::function<bool(const SectionBase &Sec)> ToRemove);
   virtual size_t totalSize() const = 0;
   virtual void finalize() = 0;
diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp
index f3e9c7750a6..52091d3e183 100644
--- a/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -83,12 +83,63 @@ static cl::alias ToRemoveA("R", cl::desc("Alias for remove-section"),
                            cl::aliasopt(ToRemove));
 static cl::opt<bool> StripSections("strip-sections",
                                    cl::desc("Remove all section headers"));
+static cl::opt<bool>
+    StripDWO("strip-dwo", cl::desc("remove all DWARF .dwo sections from file"));
+static cl::opt<bool> ExtractDWO(
+    "extract-dwo",
+    cl::desc("remove all sections that are not DWARF .dwo sections from file"));
+static cl::opt<std::string>
+    SplitDWO("split-dwo",
+             cl::desc("equivalent to extract-dwo on the input file to "
+                      "<dwo-file>, then strip-dwo on the input file"),
+             cl::value_desc("dwo-file"));

 using SectionPred = std::function<bool(const SectionBase &Sec)>;

-void CopyBinary(const ELFObjectFile<ELF64LE> &ObjFile) {
+bool IsDWOSection(const SectionBase &Sec) {
+  return Sec.Name.endswith(".dwo");
+}
+
+template <class ELFT>
+bool OnlyKeepDWOPred(const Object<ELFT> &Obj, const SectionBase &Sec) {
+  // We can't remove the section header string table.
+  if (&Sec == Obj.getSectionHeaderStrTab())
+    return false;
+  // Short of keeping the string table we want to keep everything that is a DWO
+  // section and remove everything else.
+  return !IsDWOSection(Sec);
+}
+
+template <class ELFT>
+void WriteObjectFile(const Object<ELFT> &Obj, StringRef File) {
   std::unique_ptr<FileOutputBuffer> Buffer;
+  ErrorOr<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
+      FileOutputBuffer::create(File, Obj.totalSize(),
+                               FileOutputBuffer::F_executable);
+  if (BufferOrErr.getError())
+    error("failed to open " + OutputFilename);
+  else
+    Buffer = std::move(*BufferOrErr);
+  Obj.write(*Buffer);
+  if (auto EC = Buffer->commit())
+    reportError(File, EC);
+}
+
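The way the dwo options compose section predicates can be exercised in isolation. The sketch below mirrors the layering pattern with simplified stand-in types; it is not the llvm-objcopy classes themselves:

    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    struct Section { std::string Name; };
    using Pred = std::function<bool(const Section &)>;

    // A .dwo section is recognized purely by its name suffix.
    static bool isDWO(const Section &S) {
      const std::string Suffix = ".dwo";
      return S.Name.size() >= Suffix.size() &&
             S.Name.compare(S.Name.size() - Suffix.size(), Suffix.size(),
                            Suffix) == 0;
    }

    int main() {
      std::vector<Section> Sections = {
          {".text"}, {".debug_info.dwo"}, {".strtab"}, {".debug_str.dwo"}};

      Pred Remove = [](const Section &) { return false; };
      // strip-dwo layers the .dwo filter on top of any earlier predicate,
      // capturing the previous predicate by value before reassignment.
      Remove = [Remove](const Section &S) { return isDWO(S) || Remove(S); };

      for (const Section &S : Sections)
        if (!Remove(S))
          std::cout << S.Name << "\n"; // prints .text and .strtab
      return 0;
    }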
+template <class ELFT>
+void SplitDWOToFile(const ELFObjectFile<ELFT> &ObjFile, StringRef File) {
+  // Construct a second output file for the DWO sections.
+  ELFObject<ELFT> DWOFile(ObjFile);
+
+  DWOFile.removeSections([&](const SectionBase &Sec) {
+    return OnlyKeepDWOPred(DWOFile, Sec);
+  });
+  DWOFile.finalize();
+  WriteObjectFile(DWOFile, File);
+}
+
 void CopyBinary(const ELFObjectFile<ELF64LE> &ObjFile) {
   std::unique_ptr<Object<ELF64LE>> Obj;
+
   if (!OutputFormat.empty() && OutputFormat != "binary")
     error("invalid output format '" + OutputFormat + "'");
   if (!OutputFormat.empty() && OutputFormat == "binary")
@@ -96,6 +147,9 @@ void CopyBinary(const ELFObjectFile<ELF64LE> &ObjFile) {
   else
     Obj = llvm::make_unique<ELFObject<ELF64LE>>(ObjFile);

+  if (!SplitDWO.empty())
+    SplitDWOToFile(ObjFile, SplitDWO.getValue());
+
   SectionPred RemovePred = [](const SectionBase &) { return false; };

   if (!ToRemove.empty()) {
@@ -105,6 +159,16 @@ void CopyBinary(const ELFObjectFile<ELF64LE> &ObjFile) {
     };
   }

+  if (StripDWO || !SplitDWO.empty())
+    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+      return IsDWOSection(Sec) || RemovePred(Sec);
+    };
+
+  if (ExtractDWO)
+    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+      return OnlyKeepDWOPred(*Obj, Sec) || RemovePred(Sec);
+    };
+
   if (StripSections) {
     RemovePred = [RemovePred](const SectionBase &Sec) {
       return RemovePred(Sec) || (Sec.Flags & SHF_ALLOC) == 0;
@@ -113,21 +177,8 @@ void CopyBinary(const ELFObjectFile<ELF64LE> &ObjFile) {
   }

   Obj->removeSections(RemovePred);
-
   Obj->finalize();
-  ErrorOr<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
-      FileOutputBuffer::create(OutputFilename, Obj->totalSize(),
-                               FileOutputBuffer::F_executable);
-  if (BufferOrErr.getError())
-    error("failed to open " + OutputFilename);
-  else
-    Buffer = std::move(*BufferOrErr);
-  std::error_code EC;
-  if (EC)
-    report_fatal_error(EC.message());
-  Obj->write(*Buffer);
-  if (auto EC = Buffer->commit())
-    reportError(OutputFilename, EC);
+  WriteObjectFile(*Obj, OutputFilename.getValue());
 }

 int main(int argc, char **argv) {
--
cgit v1.2.3


From 1b91c5e8aad019b3b3649db6c496b74739b4e5d2 Mon Sep 17 00:00:00 2001
From: Jun Bum Lim
Date: Fri, 3 Nov 2017 19:01:57 +0000
Subject: Add CallSiteSplitting pass

Summary:
This change adds a pass which tries to split a call-site to pass more
constrained arguments if its argument is predicated in the control flow
so that we can expose better context to the later passes (e.g., inliner,
jump threading, or IPA-CP based function cloning, etc.). As of now we
support two cases:

1) If a call site is dominated by an OR condition and if any of its
arguments are predicated on this OR condition, try to split the condition
with more constrained arguments. For example, in the code below, we try to
split the call site since we can predicate the argument (ptr) based on
the OR condition.
Split from : if (!ptr || c) callee(ptr); to : if (!ptr) callee(null ptr) // set the known constant value else if (c) callee(nonnull ptr) // set non-null attribute in the argument 2) We can also split a call-site based on constant incoming values of a PHI For example, from : BB0: %c = icmp eq i32 %i1, %i2 br i1 %c, label %BB2, label %BB1 BB1: br label %BB2 BB2: %p = phi i32 [ 0, %BB0 ], [ 1, %BB1 ] call void @bar(i32 %p) to BB0: %c = icmp eq i32 %i1, %i2 br i1 %c, label %BB2-split0, label %BB1 BB1: br label %BB2-split1 BB2-split0: call void @bar(i32 0) br label %BB2 BB2-split1: call void @bar(i32 1) br label %BB2 BB2: %p = phi i32 [ 0, %BB2-split0 ], [ 1, %BB2-split1 ] Reviewers: davidxl, huntergr, chandlerc, mcrosier, eraman, davide Reviewed By: davidxl Subscribers: sdesmalen, ashutosh.nema, fhahn, mssimpso, aemerson, mgorny, mehdi_amini, kristof.beyls, llvm-commits Differential Revision: https://reviews.llvm.org/D39137 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317351 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/InitializePasses.h | 1 + include/llvm/Transforms/Scalar.h | 8 + include/llvm/Transforms/Scalar/CallSiteSplitting.h | 29 ++ lib/Passes/PassBuilder.cpp | 9 +- lib/Passes/PassRegistry.def | 1 + lib/Transforms/IPO/PassManagerBuilder.cpp | 6 + lib/Transforms/Scalar/CMakeLists.txt | 1 + lib/Transforms/Scalar/CallSiteSplitting.cpp | 492 +++++++++++++++++++++ lib/Transforms/Scalar/Scalar.cpp | 1 + test/Other/new-pm-defaults.ll | 1 + test/Other/new-pm-lto-defaults.ll | 9 +- test/Other/new-pm-thinlto-defaults.ll | 1 + .../CallSiteSplitting/callsite-split-or-phi.ll | 339 ++++++++++++++ .../Transforms/CallSiteSplitting/callsite-split.ll | 119 +++++ 14 files changed, 1014 insertions(+), 3 deletions(-) create mode 100644 include/llvm/Transforms/Scalar/CallSiteSplitting.h create mode 100644 lib/Transforms/Scalar/CallSiteSplitting.cpp create mode 100644 test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll create mode 100644 test/Transforms/CallSiteSplitting/callsite-split.ll diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index b8183d1c8e2..9cdb49330ae 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -80,6 +80,7 @@ void initializeBranchFolderPassPass(PassRegistry&); void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry&); void initializeBranchRelaxationPass(PassRegistry&); void initializeBreakCriticalEdgesPass(PassRegistry&); +void initializeCallSiteSplittingLegacyPassPass(PassRegistry&); void initializeCFGOnlyPrinterLegacyPassPass(PassRegistry&); void initializeCFGOnlyViewerLegacyPassPass(PassRegistry&); void initializeCFGPrinterLegacyPassPass(PassRegistry&); diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index a78c897683f..0cf1115dc97 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -73,6 +73,14 @@ FunctionPass *createDeadCodeEliminationPass(); // FunctionPass *createDeadStoreEliminationPass(); + +//===----------------------------------------------------------------------===// +// +// CallSiteSplitting - This pass split call-site based on its known argument +// values. +FunctionPass *createCallSiteSplittingPass(); + + //===----------------------------------------------------------------------===// // // AggressiveDCE - This pass uses the SSA based Aggressive DCE algorithm. 
This
diff --git a/include/llvm/Transforms/Scalar/CallSiteSplitting.h b/include/llvm/Transforms/Scalar/CallSiteSplitting.h
new file mode 100644
index 00000000000..5ab951a49f2
--- /dev/null
+++ b/include/llvm/Transforms/Scalar/CallSiteSplitting.h
@@ -0,0 +1,29 @@
+//===- CallSiteSplitting.h - Callsite Splitting --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H
+#define LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Support/Compiler.h"
+#include <vector>
+
+namespace llvm {
+
+struct CallSiteSplittingPass : PassInfoMixin<CallSiteSplittingPass> {
+  /// \brief Run the pass over the function.
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index 21d95a07125..2088ea0cea2 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -89,6 +89,7 @@
 #include "llvm/Transforms/Scalar/ADCE.h"
 #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
 #include "llvm/Transforms/Scalar/BDCE.h"
+#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
 #include "llvm/Transforms/Scalar/ConstantHoisting.h"
 #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
 #include "llvm/Transforms/Scalar/DCE.h"
@@ -548,6 +549,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   EarlyFPM.addPass(SROA());
   EarlyFPM.addPass(EarlyCSEPass());
   EarlyFPM.addPass(LowerExpectIntrinsicPass());
+  if (Level == O3)
+    EarlyFPM.addPass(CallSiteSplittingPass());
+
   // In SamplePGO ThinLTO backend, we need instcombine before profile annotation
   // to convert bitcast to direct calls so that they can be inlined during the
   // profile annotation preparation step.
@@ -920,13 +924,16 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   MPM.addPass(InferFunctionAttrsPass());

   if (Level > 1) {
+    FunctionPassManager EarlyFPM(DebugLogging);
+    EarlyFPM.addPass(CallSiteSplittingPass());
+    MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM)));
+
     // Indirect call promotion. This should promote all the targets that are
     // left by the earlier promotion pass that promotes intra-module targets.
     // This two-step promotion is to save the compile time. For LTO, it should
     // produce the same result as if we only do promotion here.
     MPM.addPass(PGOIndirectCallPromotion(
         true /* InLTO */, PGOOpt && !PGOOpt->SampleProfileFile.empty()));
-
     // Propagate constants at call sites into the functions they call. This
     // opens opportunities for globalopt (and inlining) by substituting function
     // pointers passed as arguments to direct uses of functions.
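Stepping back to case 1 from the summary: at the source level the rewrite corresponds to the following, where each split call-site hands the callee strictly more information. This is a sketch of the effect, assuming the callee is inlinable; the pass itself operates on IR:

    #include <cassert>

    // The callee benefits from knowing whether p is null at each call-site.
    static int callee(int *p, int c) { return p ? *p + c : c; }

    static int before(int *p, int c) {
      if (!p || c)
        return callee(p, c); // one call-site; p's nullness is unknown here
      return 0;
    }

    static int after(int *p, int c) {
      if (!p)
        return callee(nullptr, c); // argument is a known constant
      if (c)
        return callee(p, c);       // p is provably non-null on this path
      return 0;
    }

    int main() {
      int x = 40;
      assert(before(nullptr, 2) == after(nullptr, 2));
      assert(before(&x, 2) == after(&x, 2));
      assert(before(&x, 0) == after(&x, 0));
      return 0;
    }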
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def index 20d1220ac33..40b884351fd 100644 --- a/lib/Passes/PassRegistry.def +++ b/lib/Passes/PassRegistry.def @@ -140,6 +140,7 @@ FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass()) FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass()) FUNCTION_PASS("bdce", BDCEPass()) FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass()) +FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass()) FUNCTION_PASS("consthoist", ConstantHoistingPass()) FUNCTION_PASS("correlated-propagation", CorrelatedValuePropagationPass()) FUNCTION_PASS("dce", DCEPass()) diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 828eb5eee29..b8ff614f7c8 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -467,6 +467,9 @@ void PassManagerBuilder::populateModulePassManager( addExtensionsToPM(EP_ModuleOptimizerEarly, MPM); + if (OptLevel > 2) + MPM.add(createCallSiteSplittingPass()); + MPM.add(createIPSCCPPass()); // IP SCCP MPM.add(createCalledValuePropagationPass()); MPM.add(createGlobalOptimizerPass()); // Optimize out global vars @@ -703,6 +706,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createInferFunctionAttrsLegacyPass()); if (OptLevel > 1) { + // Split call-site with more constrained arguments. + PM.add(createCallSiteSplittingPass()); + // Indirect call promotion. This should promote all the targets that are // left by the earlier promotion pass that promotes intra-module targets. // This two-step promotion is to save the compile time. For LTO, it should diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index d79ae851005..6a27fbca8b7 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -2,6 +2,7 @@ add_llvm_library(LLVMScalarOpts ADCE.cpp AlignmentFromAssumptions.cpp BDCE.cpp + CallSiteSplitting.cpp ConstantHoisting.cpp ConstantProp.cpp CorrelatedValuePropagation.cpp diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp new file mode 100644 index 00000000000..251e3322359 --- /dev/null +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -0,0 +1,492 @@ +//===- CallSiteSplitting.cpp ----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a transformation that tries to split a call-site to pass +// more constrained arguments if its argument is predicated in the control flow +// so that we can expose better context to the later passes (e.g, inliner, jump +// threading, or IPA-CP based function cloning, etc.). +// As of now we support two cases : +// +// 1) If a call site is dominated by an OR condition and if any of its arguments +// are predicated on this OR condition, try to split the condition with more +// constrained arguments. For example, in the code below, we try to split the +// call site since we can predicate the argument(ptr) based on the OR condition. 
+// +// Split from : +// if (!ptr || c) +// callee(ptr); +// to : +// if (!ptr) +// callee(null) // set the known constant value +// else if (c) +// callee(nonnull ptr) // set non-null attribute in the argument +// +// 2) We can also split a call-site based on constant incoming values of a PHI +// For example, +// from : +// Header: +// %c = icmp eq i32 %i1, %i2 +// br i1 %c, label %Tail, label %TBB +// TBB: +// br label Tail% +// Tail: +// %p = phi i32 [ 0, %Header], [ 1, %TBB] +// call void @bar(i32 %p) +// to +// Header: +// %c = icmp eq i32 %i1, %i2 +// br i1 %c, label %Tail-split0, label %TBB +// TBB: +// br label %Tail-split1 +// Tail-split0: +// call void @bar(i32 0) +// br label %Tail +// Tail-split1: +// call void @bar(i32 1) +// br label %Tail +// Tail: +// %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ] +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/CallSiteSplitting.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "callsite-splitting" + +STATISTIC(NumCallSiteSplit, "Number of call-site split"); + +static void addNonNullAttribute(Instruction *CallI, Instruction *&NewCallI, + Value *Op) { + if (!NewCallI) + NewCallI = CallI->clone(); + CallSite CS(NewCallI); + unsigned ArgNo = 0; + for (auto &I : CS.args()) { + if (&*I == Op) + CS.addParamAttr(ArgNo, Attribute::NonNull); + ++ArgNo; + } +} + +static void setConstantInArgument(Instruction *CallI, Instruction *&NewCallI, + Value *Op, Constant *ConstValue) { + if (!NewCallI) + NewCallI = CallI->clone(); + CallSite CS(NewCallI); + unsigned ArgNo = 0; + for (auto &I : CS.args()) { + if (&*I == Op) + CS.setArgument(ArgNo, ConstValue); + ++ArgNo; + } +} + +static bool createCallSitesOnOrPredicatedArgument( + CallSite CS, Instruction *&NewCSTakenFromHeader, + Instruction *&NewCSTakenFromNextCond, + SmallVectorImpl &BranchInsts, BasicBlock *HeaderBB) { + assert(BranchInsts.size() <= 2 && + "Unexpected number of blocks in the OR predicated condition"); + Instruction *Instr = CS.getInstruction(); + BasicBlock *CallSiteBB = Instr->getParent(); + TerminatorInst *HeaderTI = HeaderBB->getTerminator(); + bool IsCSInTakenPath = CallSiteBB == HeaderTI->getSuccessor(0); + + for (unsigned I = 0, E = BranchInsts.size(); I != E; ++I) { + BranchInst *PBI = BranchInsts[I]; + assert(isa(PBI->getCondition()) && + "Unexpected condition in a conditional branch."); + ICmpInst *Cmp = cast(PBI->getCondition()); + Value *Arg = Cmp->getOperand(0); + assert(isa(Cmp->getOperand(1)) && + "Expected op1 to be a constant."); + Constant *ConstVal = cast(Cmp->getOperand(1)); + CmpInst::Predicate Pred = Cmp->getPredicate(); + + if (PBI->getParent() == HeaderBB) { + Instruction *&CallTakenFromHeader = + IsCSInTakenPath ? NewCSTakenFromHeader : NewCSTakenFromNextCond; + Instruction *&CallUntakenFromHeader = + IsCSInTakenPath ? NewCSTakenFromNextCond : NewCSTakenFromHeader; + + assert(Pred == ICmpInst::ICMP_EQ || + Pred == ICmpInst::ICMP_NE && + "Unexpected predicate in an OR condition"); + + // Set the constant value for agruments in the call predicated based on + // the OR condition. + Instruction *&CallToSetConst = Pred == ICmpInst::ICMP_EQ + ? 
CallTakenFromHeader
+                                         : CallUntakenFromHeader;
+      setConstantInArgument(Instr, CallToSetConst, Arg, ConstVal);
+
+      // Add the NonNull attribute if compared with the null pointer.
+      if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
+        Instruction *&CallToSetAttr = Pred == ICmpInst::ICMP_EQ
+                                          ? CallUntakenFromHeader
+                                          : CallTakenFromHeader;
+        addNonNullAttribute(Instr, CallToSetAttr, Arg);
+      }
+      continue;
+    }
+
+    if (Pred == ICmpInst::ICMP_EQ) {
+      if (PBI->getSuccessor(0) == Instr->getParent()) {
+        // Set the constant value for the call taken from the second block in
+        // the OR condition.
+        setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal);
+      } else {
+        // Add the NonNull attribute if compared with the null pointer for the
+        // call taken from the second block in the OR condition.
+        if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue())
+          addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg);
+      }
+    } else {
+      if (PBI->getSuccessor(0) == Instr->getParent()) {
+        // Add the NonNull attribute if compared with the null pointer for the
+        // call taken from the second block in the OR condition.
+        if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue())
+          addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg);
+      } else if (Pred == ICmpInst::ICMP_NE) {
+        // Set the constant value for the call in the untaken path from the
+        // header block.
+        setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal);
+      } else
+        llvm_unreachable("Unexpected condition");
+    }
+  }
+  return NewCSTakenFromHeader || NewCSTakenFromNextCond;
+}
+
+static bool canSplitCallSite(CallSite CS) {
+  // FIXME: As of now we handle only CallInst. InvokeInst could be handled
+  // without too much effort.
+  Instruction *Instr = CS.getInstruction();
+  if (!isa<CallInst>(Instr))
+    return false;
+
+  // Allow splitting a call-site only when there is no instruction before the
+  // call-site in the basic block. Based on this constraint, we only clone the
+  // call instruction, and we do not move a call-site across any other
+  // instruction.
+  BasicBlock *CallSiteBB = Instr->getParent();
+  if (Instr != CallSiteBB->getFirstNonPHI())
+    return false;
+
+  pred_iterator PII = pred_begin(CallSiteBB);
+  pred_iterator PIE = pred_end(CallSiteBB);
+  unsigned NumPreds = std::distance(PII, PIE);
+
+  // Allow only one extra call-site. No more than two from one call-site.
+  if (NumPreds != 2)
+    return false;
+
+  // Cannot split an edge from an IndirectBrInst.
+  BasicBlock *Preds[2] = {*PII++, *PII};
+  if (isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
+      isa<IndirectBrInst>(Preds[1]->getTerminator()))
+    return false;
+
+  return CallSiteBB->canSplitPredecessors();
+}
+
+/// Return true if the CS is split into its new predecessors which are directly
+/// hooked to each of its original predecessors pointed by PredBB1 and PredBB2.
+/// Note that PredBB1 and PredBB2 are decided in findPredicatedArgument(),
+/// especially for the OR predicated case where PredBB1 will point to the
+/// header, and PredBB2 will point to the second compare block. CallInst1 and
+/// CallInst2 will be the new call-sites placed in the new predecessors split
+/// for PredBB1 and PredBB2, respectively. Therefore, CallInst1 will be the
+/// call-site placed between Header and Tail, and CallInst2 will be the
+/// call-site between TBB and Tail.
+/// For example, in the IR below with an OR condition, the call-site can
+/// be split
+///
+/// from :
+///
+///   Header:
+///     %c = icmp eq i32* %a, null
+///     br i1 %c %Tail, %TBB
+///   TBB:
+///     %c2 = icmp eq i32* %b, null
+///     br i1 %c %Tail, %End
+///   Tail:
+///     %ca = call i1 @callee (i32* %a, i32* %b)
+///
+/// to :
+///
+///   Header:                          // PredBB1 is Header
+///     %c = icmp eq i32* %a, null
+///     br i1 %c %Tail-split1, %TBB
+///   TBB:                             // PredBB2 is TBB
+///     %c2 = icmp eq i32* %b, null
+///     br i1 %c %Tail-split2, %End
+///   Tail-split1:
+///     %ca1 = call @callee (i32* null, i32* %b)         // CallInst1
+///     br %Tail
+///   Tail-split2:
+///     %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2
+///     br %Tail
+///   Tail:
+///     %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]
+///
+/// Note that for an OR predicated case, CallInst1 and CallInst2 should be
+/// created with more constrained arguments in
+/// createCallSitesOnOrPredicatedArgument().
+static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
+                          Instruction *CallInst1, Instruction *CallInst2) {
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *TailBB = Instr->getParent();
+  assert(Instr == (TailBB->getFirstNonPHI()) && "Unexpected call-site");
+
+  BasicBlock *SplitBlock1 =
+      SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split");
+  BasicBlock *SplitBlock2 =
+      SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split");
+
+  assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split.");
+
+  if (!CallInst1)
+    CallInst1 = Instr->clone();
+  if (!CallInst2)
+    CallInst2 = Instr->clone();
+
+  CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt());
+  CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt());
+
+  CallSite CS1(CallInst1);
+  CallSite CS2(CallInst2);
+
+  // Handle PHIs used as arguments in the call-site.
+  for (auto &PI : *TailBB) {
+    PHINode *PN = dyn_cast<PHINode>(&PI);
+    if (!PN)
+      break;
+    unsigned ArgNo = 0;
+    for (auto &CI : CS.args()) {
+      if (&*CI == PN) {
+        CS1.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock1));
+        CS2.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock2));
+      }
+      ++ArgNo;
+    }
+  }
+
+  // Replace users of the original call with a PHI merging the split call-sites.
+  if (Instr->getNumUses()) {
+    PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call", Instr);
+    PN->addIncoming(CallInst1, SplitBlock1);
+    PN->addIncoming(CallInst2, SplitBlock2);
+    Instr->replaceAllUsesWith(PN);
+  }
+  DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
+  DEBUG(dbgs() << "    " << *CallInst1 << " in " << SplitBlock1->getName()
+               << "\n");
+  DEBUG(dbgs() << "    " << *CallInst2 << " in " << SplitBlock2->getName()
+               << "\n");
+  Instr->eraseFromParent();
+  NumCallSiteSplit++;
+}
+
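In source terms, what splitCallSite does to a call whose operand and result both flow through the merge point can be pictured as follows. This is an illustrative sketch assuming an opaque callee, not what the pass literally emits (it works on IR):

    #include <cassert>

    static int callee(int x) { return x + 1; }

    // Before: one call at the merge point, fed by a PHI of its operands.
    static int original(bool fromHeader) {
      int a = fromHeader ? 10 : 20; // the operand PHI at the head of Tail
      int r = callee(a);            // single call-site
      return r * 2;                 // a user of the call's result
    }

    // After: one clone of the call per new predecessor; the clones' results
    // are merged by the "phi.call" node, which all former users now read.
    static int split(bool fromHeader) {
      int r;
      if (fromHeader)
        r = callee(10); // clone in SplitBlock1, operand PHI resolved
      else
        r = callee(20); // clone in SplitBlock2
      return r * 2;
    }

    int main() {
      assert(original(true) == split(true));
      assert(original(false) == split(false));
      return 0;
    }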
+static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) {
+  assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand.");
+  Value *Op0 = Cmp->getOperand(0);
+  unsigned ArgNo = 0;
+  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
+       ++I, ++ArgNo) {
+    // Don't consider constant or arguments that are already known non-null.
+    if (isa<Constant>(*I) || CS.paramHasAttr(ArgNo, Attribute::NonNull))
+      continue;
+
+    if (*I == Op0)
+      return true;
+  }
+  return false;
+}
+
+static void findOrCondRelevantToCallArgument(
+    CallSite CS, BasicBlock *PredBB, BasicBlock *OtherPredBB,
+    SmallVectorImpl<BranchInst *> &BranchInsts, BasicBlock *&HeaderBB) {
+  auto *PBI = dyn_cast<BranchInst>(PredBB->getTerminator());
+  if (!PBI || !PBI->isConditional())
+    return;
+
+  if (PBI->getSuccessor(0) == OtherPredBB ||
+      PBI->getSuccessor(1) == OtherPredBB)
+    if (PredBB == OtherPredBB->getSinglePredecessor()) {
+      assert(!HeaderBB && "Expect to find only a single header block");
+      HeaderBB = PredBB;
+    }
+
+  CmpInst::Predicate Pred;
+  Value *Cond = PBI->getCondition();
+  if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant())))
+    return;
+  ICmpInst *Cmp = cast<ICmpInst>(Cond);
+  if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)
+    if (isCondRelevantToAnyCallArgument(Cmp, CS))
+      BranchInsts.push_back(PBI);
+}
+
+// Return true if the call-site has an argument which is a PHI with only
+// constant incoming values.
+static bool isPredicatedOnPHI(CallSite CS) {
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *Parent = Instr->getParent();
+  if (Instr != Parent->getFirstNonPHI())
+    return false;
+
+  for (auto &BI : *Parent) {
+    if (PHINode *PN = dyn_cast<PHINode>(&BI)) {
+      for (auto &I : CS.args())
+        if (&*I == PN) {
+          assert(PN->getNumIncomingValues() == 2 &&
+                 "Unexpected number of incoming values");
+          if (PN->getIncomingBlock(0) == PN->getIncomingBlock(1))
+            return false;
+          if (PN->getIncomingValue(0) == PN->getIncomingValue(1))
+            continue;
+          if (isa<Constant>(PN->getIncomingValue(0)) &&
+              isa<Constant>(PN->getIncomingValue(1)))
+            return true;
+        }
+    }
+    break;
+  }
+  return false;
+}
+
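The shape isPredicatedOnPHI looks for corresponds, at the source level, to a call whose argument is a merge of two constants; splitting lets each clone be folded once the callee is inlined. A sketch under the assumption that bar is visible to the optimizer (illustrative, not the pass's output):

    #include <cassert>

    static int bar(int p) { return p * 10; }

    // Before: one call on a PHI of the constants 0 and 1.
    static int beforeSplit(int i1, int i2) {
      int p = (i1 == i2) ? 0 : 1; // the two-entry constant PHI
      return bar(p);
    }

    // After: the call is duplicated per incoming edge with the constant
    // substituted, so each copy of bar() folds to 0 or 10 after inlining.
    static int afterSplit(int i1, int i2) {
      if (i1 == i2)
        return bar(0); // Tail-split0
      return bar(1);   // Tail-split1
    }

    int main() {
      assert(beforeSplit(3, 3) == afterSplit(3, 3));
      assert(beforeSplit(3, 4) == afterSplit(3, 4));
      return 0;
    }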
+// Return true if an argument in CS is predicated on an 'or' condition.
+// Create new call-sites with arguments constrained based on the OR condition.
+static bool findPredicatedOnOrCondition(CallSite CS, BasicBlock *PredBB1,
+                                        BasicBlock *PredBB2,
+                                        Instruction *&NewCallTakenFromHeader,
+                                        Instruction *&NewCallTakenFromNextCond,
+                                        BasicBlock *&HeaderBB) {
+  SmallVector<BranchInst *, 2> BranchInsts;
+  findOrCondRelevantToCallArgument(CS, PredBB1, PredBB2, BranchInsts, HeaderBB);
+  findOrCondRelevantToCallArgument(CS, PredBB2, PredBB1, BranchInsts, HeaderBB);
+  if (BranchInsts.empty() || !HeaderBB)
+    return false;
+
+  // If an OR condition is detected, try to create call sites with constrained
+  // arguments (e.g., NonNull attribute or constant value).
+  return createCallSitesOnOrPredicatedArgument(CS, NewCallTakenFromHeader,
+                                               NewCallTakenFromNextCond,
+                                               BranchInsts, HeaderBB);
+}
+
+static bool findPredicatedArgument(CallSite CS, Instruction *&CallInst1,
+                                   Instruction *&CallInst2,
+                                   BasicBlock *&PredBB1, BasicBlock *&PredBB2) {
+  BasicBlock *CallSiteBB = CS.getInstruction()->getParent();
+  pred_iterator PII = pred_begin(CallSiteBB);
+  pred_iterator PIE = pred_end(CallSiteBB);
+  assert(std::distance(PII, PIE) == 2 && "Expect only two predecessors.");
+  BasicBlock *Preds[2] = {*PII++, *PII};
+  BasicBlock *&HeaderBB = PredBB1;
+  if (!findPredicatedOnOrCondition(CS, Preds[0], Preds[1], CallInst1, CallInst2,
+                                   HeaderBB) &&
+      !isPredicatedOnPHI(CS))
+    return false;
+
+  if (!PredBB1)
+    PredBB1 = Preds[0];
+
+  PredBB2 = PredBB1 == Preds[0] ? Preds[1] : Preds[0];
+  return true;
+}
+
+static bool tryToSplitCallSite(CallSite CS) {
+  if (!CS.arg_size())
+    return false;
+
+  BasicBlock *PredBB1 = nullptr;
+  BasicBlock *PredBB2 = nullptr;
+  Instruction *CallInst1 = nullptr;
+  Instruction *CallInst2 = nullptr;
+  if (!canSplitCallSite(CS) ||
+      !findPredicatedArgument(CS, CallInst1, CallInst2, PredBB1, PredBB2)) {
+    assert(!CallInst1 && !CallInst2 && "Unexpected new call-sites cloned.");
+    return false;
+  }
+  splitCallSite(CS, PredBB1, PredBB2, CallInst1, CallInst2);
+  return true;
+}
+
+static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
+  bool Changed = false;
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
+    BasicBlock &BB = *BI++;
+    for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
+      Instruction *I = &*II++;
+      CallSite CS(cast<Value>(I));
+      if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI))
+        continue;
+
+      Function *Callee = CS.getCalledFunction();
+      if (!Callee || Callee->isDeclaration())
+        continue;
+      Changed |= tryToSplitCallSite(CS);
+    }
+  }
+  return Changed;
+}
+
+namespace {
+struct CallSiteSplittingLegacyPass : public FunctionPass {
+  static char ID;
+  CallSiteSplittingLegacyPass() : FunctionPass(ID) {
+    initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    return doCallSiteSplitting(F, TLI);
+  }
+};
+} // namespace
+
+char CallSiteSplittingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
+                      "Call-site splitting", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
+                    "Call-site splitting", false, false)
+FunctionPass *llvm::createCallSiteSplittingPass() {
+  return new CallSiteSplittingLegacyPass();
+}
+
+PreservedAnalyses CallSiteSplittingPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+  if (!doCallSiteSplitting(F, TLI))
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  return PA;
+}
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index c1034ace206..8a5ae1b8731 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -35,6 +35,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeADCELegacyPassPass(Registry);
   initializeBDCELegacyPassPass(Registry);
   initializeAlignmentFromAssumptionsPass(Registry);
+  initializeCallSiteSplittingLegacyPassPass(Registry);
   initializeConstantHoistingLegacyPassPass(Registry);
   initializeConstantPropagationPass(Registry);
   initializeCorrelatedValuePropagationPass(Registry);
diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll
index 816f75310e3..0810a13c141 100644
--- a/test/Other/new-pm-defaults.ll
+++ b/test/Other/new-pm-defaults.ll
@@ -76,6 +76,7 @@
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
 ; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass
+; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass
 ; CHECK-O-NEXT: Finished llvm::Function pass manager run.
; CHECK-O-NEXT: Running pass: IPSCCPPass ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass diff --git a/test/Other/new-pm-lto-defaults.ll b/test/Other/new-pm-lto-defaults.ll index fc52f70ff4c..878198d1447 100644 --- a/test/Other/new-pm-lto-defaults.ll +++ b/test/Other/new-pm-lto-defaults.ll @@ -29,9 +29,14 @@ ; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass ; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis +; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> +; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Module +; CHECK-O2-NEXT: Starting llvm::Function pass manager run. +; CHECK-O2-NEXT: Running pass: CallSiteSplittingPass on foo +; CHECK-O2-NEXT: Running analysis: TargetLibraryAnalysis on foo +; CHECK-O2-NEXT: Finished llvm::Function pass manager run. ; CHECK-O2-NEXT: PGOIndirectCallPromotion ; CHECK-O2-NEXT: Running analysis: ProfileSummaryAnalysis -; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Function ; CHECK-O2-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis ; CHECK-O2-NEXT: Running pass: IPSCCPPass ; CHECK-O2-NEXT: Running pass: CalledValuePropagationPass @@ -42,7 +47,7 @@ ; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}> ; CHECK-O-NEXT: Running analysis: AAManager -; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis +; CHECK-O1-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass ; CHECK-O-NEXT: Running analysis: CallGraphAnalysis ; CHECK-O-NEXT: Running pass: GlobalSplitPass diff --git a/test/Other/new-pm-thinlto-defaults.ll b/test/Other/new-pm-thinlto-defaults.ll index 7d40ef3eea2..e83f0f87055 100644 --- a/test/Other/new-pm-thinlto-defaults.ll +++ b/test/Other/new-pm-thinlto-defaults.ll @@ -72,6 +72,7 @@ ; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass +; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. 
; CHECK-O-NEXT: Running pass: IPSCCPPass ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass diff --git a/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll new file mode 100644 index 00000000000..d1d854d8f45 --- /dev/null +++ b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll @@ -0,0 +1,339 @@ +; RUN: opt < %s -callsite-splitting -S | FileCheck %s +; RUN: opt < %s -passes='function(callsite-splitting)' -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linaro-linux-gnueabi" + +;CHECK-LABEL: @test_eq_eq +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_eq_eq(i32* %a, i32 %v) { +Header: + %tobool1 = icmp eq i32* %a, null + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_ne_eq +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_ne_eq(i32* %a, i32 %v) { +Header: + %tobool1 = icmp ne i32* %a, null + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_ne_ne +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_ne_ne(i32* %a, i32 %v) { +Header: + %tobool1 = icmp ne i32* %a, null + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp ne i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_eq_eq_untaken +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 
@test_eq_eq_untaken(i32* %a, i32 %v) { +Header: + %tobool1 = icmp eq i32* %a, null + br i1 %tobool1, label %TBB, label %Tail + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_ne_eq_untaken +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_ne_eq_untaken(i32* %a, i32 %v) { +Header: + %tobool1 = icmp ne i32* %a, null + br i1 %tobool1, label %TBB, label %Tail + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_ne_ne_untaken +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_ne_ne_untaken(i32* %a, i32 %v) { +Header: + %tobool1 = icmp ne i32* %a, null + br i1 %tobool1, label %TBB, label %Tail + +TBB: + %cmp = icmp ne i32 %v, 1 + br i1 %cmp, label %End, label %Tail + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_nonconst_const_phi +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_nonconst_const_phi(i32* %a, i32* %b, i32 %v) { +Header: + %tobool1 = icmp eq i32* %a, %b + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_nonconst_nonconst_phi +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 %v, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_nonconst_nonconst_phi(i32* %a, i32* %b, i32 %v, i32 %v2) { +Header: + %tobool1 = icmp eq i32* %a, %b + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, %v2 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi 
i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_nonconst_nonconst_phi_noncost +;CHECK-NOT: Tail.predBB1.split: +;CHECK-NOT: Tail.predBB2.split: +;CHECK-LABEL: Tail: +;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) +;CHECK: ret i32 %r +define i32 @test_nonconst_nonconst_phi_noncost(i32* %a, i32* %b, i32 %v, i32 %v2) { +Header: + %tobool1 = icmp eq i32* %a, %b + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, %v2 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[%v,%Header], [%v2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_fisrtnonphi +;CHECK-NOT: Tail.predBB1.split: +;CHECK-NOT: Tail.predBB2.split: +;CHECK-LABEL: Tail: +;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) +;CHECK: ret i32 %r +define i32 @test_fisrtnonphi(i32* %a, i32 %v) { +Header: + %tobool1 = icmp eq i32* %a, null + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + store i32 %v, i32* %a + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_3preds_constphi +;CHECK-NOT: Tail.predBB1.split: +;CHECK-NOT: Tail.predBB2.split: +;CHECK-LABEL: Tail: +;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) +;CHECK: ret i32 %r +define i32 @test_3preds_constphi(i32* %a, i32 %v, i1 %c1, i1 %c2, i1 %c3) { +Header: + br i1 %c1, label %Tail, label %TBB1 + +TBB1: + br i1 %c2, label %Tail, label %TBB2 + +TBB2: + br i1 %c3, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB1], [3, %TBB2] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_indirectbr_phi +;CHECK-NOT: Tail.predBB1.split: +;CHECK-NOT: Tail.predBB2.split: +;CHECK-LABEL: Tail: +;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) +;CHECK: ret i32 %r +define i32 @test_indirectbr_phi(i8* %address, i32* %a, i32* %b, i32 %v) { +Header: + %indirect.goto.dest = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address + indirectbr i8* %indirect.goto.dest, [label %TBB, label %Tail] + +TBB: + %indirect.goto.dest2 = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address + indirectbr i8* %indirect.goto.dest2, [label %Tail, label %End] + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +define i32 @callee(i32* %a, i32 %v, i32 %p) { +entry: + %c = icmp ne i32* %a, null + br i1 %c, label %BB1, label %BB2 + +BB1: + call void @dummy(i32* %a, i32 %p) + br label %End + +BB2: + call void @dummy2(i32 %v, i32 %p) + br label %End + +End: + ret i32 %p +} + +declare void @dummy(i32*, i32) +declare void @dummy2(i32, i32) diff --git a/test/Transforms/CallSiteSplitting/callsite-split.ll b/test/Transforms/CallSiteSplitting/callsite-split.ll new file mode 100644 index 00000000000..419fa738563 --- /dev/null +++ b/test/Transforms/CallSiteSplitting/callsite-split.ll @@ -0,0 +1,119 @@ +; RUN: opt < %s -callsite-splitting -inline -instcombine -jump-threading -S | FileCheck %s +; RUN: opt < %s -passes='function(callsite-splitting),cgscc(inline),function(instcombine,jump-threading)' -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linaro-linux-gnueabi" + +%struct.bitmap = type { i32, 
%struct.bitmap* }
+
+;CHECK-LABEL: @caller
+;CHECK-LABEL: NextCond:
+;CHECK: br {{.*}} label %callee.exit
+;CHECK-LABEL: CallSiteBB.predBB1.split:
+;CHECK: call void @callee(%struct.bitmap* null, %struct.bitmap* null, %struct.bitmap* %b_elt, i1 false)
+;CHECK-LABEL: callee.exit:
+;CHECK: call void @dummy2(%struct.bitmap* %a_elt)
+
+define void @caller(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt) {
+entry:
+  br label %Top
+
+Top:
+  %tobool1 = icmp eq %struct.bitmap* %a_elt, null
+  br i1 %tobool1, label %CallSiteBB, label %NextCond
+
+NextCond:
+  %cmp = icmp ne %struct.bitmap* %b_elt, null
+  br i1 %cmp, label %CallSiteBB, label %End
+
+CallSiteBB:
+  %p = phi i1 [0, %Top], [%c, %NextCond]
+  call void @callee(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %p)
+  br label %End
+
+End:
+  ret void
+}
+
+define void @callee(%struct.bitmap* %dst_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %c) {
+entry:
+  %tobool = icmp ne %struct.bitmap* %a_elt, null
+  %tobool1 = icmp ne %struct.bitmap* %b_elt, null
+  %or.cond = and i1 %tobool, %tobool1
+  br i1 %or.cond, label %Cond, label %Big
+
+Cond:
+  %cmp = icmp eq %struct.bitmap* %dst_elt, %a_elt
+  br i1 %cmp, label %Small, label %Big
+
+Small:
+  call void @dummy2(%struct.bitmap* %a_elt)
+  br label %End
+
+Big:
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  br label %End
+
+End:
+  ret void
+}
+
+declare void @dummy2(%struct.bitmap*)
+declare void @dummy1(%struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*)
+
+
+;CHECK-LABEL: @caller2
+;CHECK-LABEL: CallSiteBB.predBB1.split:
+;CHECK: call void @dummy4()
+;CHECK-LABEL: CallSiteBB.predBB2.split:
+;CHECK: call void @dummy3()
+;CHECK-LABEL: CallSiteBB:
+;CHECK: %phi.call = phi i1 [ false, %CallSiteBB.predBB1.split ], [ true, %CallSiteBB.predBB2.split ]
+;CHECK: call void @foo(i1 %phi.call)
+define void @caller2(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, %struct.bitmap* %c_elt) {
+entry:
+  br label %Top
+
+Top:
+  %tobool1 = icmp eq %struct.bitmap* %a_elt, %b_elt
+  br i1 %tobool1, label %CallSiteBB, label %NextCond
+
+NextCond:
+  %cmp = icmp ne %struct.bitmap* %b_elt, %c_elt
+  br i1 %cmp, label %CallSiteBB, label %End
+
+CallSiteBB:
+  %phi = phi i1 [0, %Top],[1, %NextCond]
+  %u = call i1 @callee2(i1 %phi)
+  call void @foo(i1 %u)
+  br label %End
+
+End:
+  ret void
+}
+
+define i1 @callee2(i1 %b) {
+entry:
+  br i1 %b, label %BB1, label %BB2
+
+BB1:
+  call void @dummy3()
+  br label %End
+
+BB2:
+  call void @dummy4()
+  br label %End
+
+End:
+  ret i1 %b
+}
+
+declare void @dummy3()
+declare void @dummy4()
+declare void @foo(i1)
-- 
cgit v1.2.3


From 06dbf5ad00617e8f526e9ed55343fc88fafef6a8 Mon Sep 17 00:00:00 2001
From: Jake Ehrlich
Date: Fri, 3 Nov 2017 19:15:06 +0000
Subject: Reland "Add support for writing 64-bit symbol tables for archives
 when offsets become too large for 32-bit"

Tests were failing because some bots were running out of address space
and memory. Additionally the test was very slow. These issues were solved
by changing the test to take advantage of sparse files and restricting the
test to run only on 64-bit systems.

This should fix https://bugs.llvm.org//show_bug.cgi?id=34189

This change makes it so that if, while writing a K_GNU style archive, an
offset larger than 32 bits needs to be output, the archive is written in
K_GNU64 style instead.

Differential Revision: https://reviews.llvm.org/D36812

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317352 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Object/ArchiveWriter.cpp         | 64 +++++++++++++++++++++++++++++-------
 test/Object/archive-SYM64-write.test | 35 ++++++++++++++++++++
 2 files changed, 90 insertions(+), 9 deletions(-)
 create mode 100644 test/Object/archive-SYM64-write.test

diff --git a/lib/Object/ArchiveWriter.cpp b/lib/Object/ArchiveWriter.cpp
index 919e2676802..63f5082c29d 100644
--- a/lib/Object/ArchiveWriter.cpp
+++ b/lib/Object/ArchiveWriter.cpp
@@ -122,11 +122,11 @@ static void printWithSpacePadding(raw_ostream &OS, T Data, unsigned Size) {
 static bool isBSDLike(object::Archive::Kind Kind) {
   switch (Kind) {
   case object::Archive::K_GNU:
+  case object::Archive::K_GNU64:
     return false;
   case object::Archive::K_BSD:
   case object::Archive::K_DARWIN:
     return true;
-  case object::Archive::K_GNU64:
   case object::Archive::K_DARWIN64:
   case object::Archive::K_COFF:
     break;
@@ -134,8 +134,8 @@ static bool isBSDLike(object::Archive::Kind Kind) {
   llvm_unreachable("not supported for writing");
 }

-static void print32(raw_ostream &Out, object::Archive::Kind Kind,
-                    uint32_t Val) {
+template <class T>
+static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val) {
   if (isBSDLike(Kind))
     support::endian::Writer<support::little>(Out).write(Val);
   else
     support::endian::Writer<support::big>(Out).write(Val);
 }
@@ -216,6 +216,20 @@ static std::string computeRelativePath(StringRef From, StringRef To) {
   return Relative.str();
 }

+static bool is64BitKind(object::Archive::Kind Kind) {
+  switch (Kind) {
+  case object::Archive::K_GNU:
+  case object::Archive::K_BSD:
+  case object::Archive::K_DARWIN:
+  case object::Archive::K_COFF:
+    return false;
+  case object::Archive::K_DARWIN64:
+  case object::Archive::K_GNU64:
+    return true;
+  }
+  llvm_unreachable("not supported for writing");
+}
+
 static void addToStringTable(raw_ostream &Out, StringRef ArcName,
                              const NewArchiveMember &M, bool Thin) {
   StringRef ID = M.Buf->getBufferIdentifier();
@@ -288,6 +302,14 @@ static bool isArchiveSymbol(const object::BasicSymbolRef &S) {
   return true;
 }

+static void printNBits(raw_ostream &Out, object::Archive::Kind Kind,
+                       uint64_t Val) {
+  if (is64BitKind(Kind))
+    print<uint64_t>(Out, Kind, Val);
+  else
+    print<uint32_t>(Out, Kind, Val);
+}
+
 static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
                              bool Deterministic, ArrayRef<MemberData> Members,
                              StringRef StringTable) {
@@ -299,9 +321,11 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
     NumSyms += M.Symbols.size();

   unsigned Size = 0;
-  Size += 4; // Number of entries
+  Size += is64BitKind(Kind) ? 8 : 4; // Number of entries
   if (isBSDLike(Kind))
     Size += NumSyms * 8; // Table
+  else if (is64BitKind(Kind))
+    Size += NumSyms * 8; // Table
   else
     Size += NumSyms * 4; // Table
   if (isBSDLike(Kind))
@@ -318,27 +342,30 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
   if (isBSDLike(Kind))
     printBSDMemberHeader(Out, Out.tell(), "__.SYMDEF", now(Deterministic), 0, 0,
                          0, Size);
+  else if (is64BitKind(Kind))
+    printGNUSmallMemberHeader(Out, "/SYM64", now(Deterministic), 0, 0, 0, Size);
   else
     printGNUSmallMemberHeader(Out, "", now(Deterministic), 0, 0, 0, Size);

   uint64_t Pos = Out.tell() + Size;

   if (isBSDLike(Kind))
-    print32(Out, Kind, NumSyms * 8);
+    print<uint32_t>(Out, Kind, NumSyms * 8);
   else
-    print32(Out, Kind, NumSyms);
+    printNBits(Out, Kind, NumSyms);

   for (const MemberData &M : Members) {
     for (unsigned StringOffset : M.Symbols) {
       if (isBSDLike(Kind))
-        print32(Out, Kind, StringOffset);
+        print<uint32_t>(Out, Kind, StringOffset);
-      print32(Out, Kind, Pos); // member offset
+      printNBits(Out, Kind, Pos); // member offset
     }
     Pos += M.Header.size() + M.Data.size() + M.Padding.size();
   }

   if (isBSDLike(Kind))
-    print32(Out, Kind, StringTable.size()); // byte count of the string table
+    // byte count of the string table
+    print<uint32_t>(Out, Kind, StringTable.size());

   Out << StringTable;

   while (Pad--)
@@ -442,6 +469,25 @@ Error llvm::writeArchive(StringRef ArcName,
   if (!StringTableBuf.empty())
     Data.insert(Data.begin(), computeStringTable(StringTableBuf));

+  // We would like to detect if we need to switch to a 64-bit symbol table.
+  if (WriteSymtab) {
+    uint64_t MaxOffset = 0;
+    uint64_t LastOffset = MaxOffset;
+    for (const auto& M : Data) {
+      // Record the start of the member's offset
+      LastOffset = MaxOffset;
+      // Account for the size of each part associated with the member.
+      MaxOffset += M.Header.size() + M.Data.size() + M.Padding.size();
+      // We assume 32-bit symbols to see if 32-bit symbols are possible or not.
+      MaxOffset += M.Symbols.size() * 4;
+    }
+    // If LastOffset isn't going to fit in a 32-bit variable we need to switch
+    // to 64-bit. Note that the file can be larger than 4GB as long as the last
+    // member starts before the 4GB offset.
+ if (LastOffset >> 32 != 0) + Kind = object::Archive::K_GNU64; + } + SmallString<128> TmpArchive; int TmpArchiveFD; if (auto EC = sys::fs::createUniqueFile(ArcName + ".temp-archive-%%%%%%%.a", diff --git a/test/Object/archive-SYM64-write.test b/test/Object/archive-SYM64-write.test new file mode 100644 index 00000000000..d03b54c58b3 --- /dev/null +++ b/test/Object/archive-SYM64-write.test @@ -0,0 +1,35 @@ +# REQUIRES: llvm-64-bits +# REQUIRES: system-linux + +# RUN: yaml2obj %s > %t +# RUN: dd if=%t of=%t bs=1 count=0 seek=2200M +# RUN: rm -f %t.lib +# RUN: cp %t %t2 +# RUN: llvm-ar cr %t.lib %t %t2 %p/Inputs/trivial-object-test.elf-x86-64 +# RUN: llvm-nm --print-armap %t.lib | FileCheck %s + +!ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .data + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + AddressAlign: 0x0000000000000001 + Content: "00" + Size: 32 + +# CHECK: Archive map +# CHECK-NEXT: main in trivial-object-test.elf-x86-64 + +# CHECK: archive-SYM64-write.test.tmp: + +# CHECK: archive-SYM64-write.test.tmp2: + +# CHECK: trivial-object-test.elf-x86-64: +# CHECK-NEXT: U SomeOtherFunction +# CHECK-NEXT: 0000000000000000 T main +# CHECK-NEXT: U puts -- cgit v1.2.3 From c86c85f907f2513916a2cbd184c8a02d7c64d5a2 Mon Sep 17 00:00:00 2001 From: Jun Bum Lim Date: Fri, 3 Nov 2017 19:17:11 +0000 Subject: Revert "Add CallSiteSplitting pass" Revert due to Buildbot failure. This reverts commit r317351. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317353 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/InitializePasses.h | 1 - include/llvm/Transforms/Scalar.h | 8 - include/llvm/Transforms/Scalar/CallSiteSplitting.h | 29 -- lib/Passes/PassBuilder.cpp | 9 +- lib/Passes/PassRegistry.def | 1 - lib/Transforms/IPO/PassManagerBuilder.cpp | 6 - lib/Transforms/Scalar/CMakeLists.txt | 1 - lib/Transforms/Scalar/CallSiteSplitting.cpp | 492 --------------------- lib/Transforms/Scalar/Scalar.cpp | 1 - test/Other/new-pm-defaults.ll | 1 - test/Other/new-pm-lto-defaults.ll | 9 +- test/Other/new-pm-thinlto-defaults.ll | 1 - .../CallSiteSplitting/callsite-split-or-phi.ll | 339 -------------- .../Transforms/CallSiteSplitting/callsite-split.ll | 119 ----- 14 files changed, 3 insertions(+), 1014 deletions(-) delete mode 100644 include/llvm/Transforms/Scalar/CallSiteSplitting.h delete mode 100644 lib/Transforms/Scalar/CallSiteSplitting.cpp delete mode 100644 test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll delete mode 100644 test/Transforms/CallSiteSplitting/callsite-split.ll diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 9cdb49330ae..b8183d1c8e2 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -80,7 +80,6 @@ void initializeBranchFolderPassPass(PassRegistry&); void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry&); void initializeBranchRelaxationPass(PassRegistry&); void initializeBreakCriticalEdgesPass(PassRegistry&); -void initializeCallSiteSplittingLegacyPassPass(PassRegistry&); void initializeCFGOnlyPrinterLegacyPassPass(PassRegistry&); void initializeCFGOnlyViewerLegacyPassPass(PassRegistry&); void initializeCFGPrinterLegacyPassPass(PassRegistry&); diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index 0cf1115dc97..a78c897683f 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -73,14 +73,6 @@ FunctionPass *createDeadCodeEliminationPass(); // FunctionPass 
*createDeadStoreEliminationPass(); - -//===----------------------------------------------------------------------===// -// -// CallSiteSplitting - This pass split call-site based on its known argument -// values. -FunctionPass *createCallSiteSplittingPass(); - - //===----------------------------------------------------------------------===// // // AggressiveDCE - This pass uses the SSA based Aggressive DCE algorithm. This diff --git a/include/llvm/Transforms/Scalar/CallSiteSplitting.h b/include/llvm/Transforms/Scalar/CallSiteSplitting.h deleted file mode 100644 index 5ab951a49f2..00000000000 --- a/include/llvm/Transforms/Scalar/CallSiteSplitting.h +++ /dev/null @@ -1,29 +0,0 @@ -//===- CallSiteSplitting..h - Callsite Splitting ------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H -#define LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H - -#include "llvm/ADT/SetVector.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Support/Compiler.h" -#include - -namespace llvm { - -struct CallSiteSplittingPass : PassInfoMixin { - /// \brief Run the pass over the function. - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); -}; -} // end namespace llvm - -#endif // LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index 2088ea0cea2..21d95a07125 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -89,7 +89,6 @@ #include "llvm/Transforms/Scalar/ADCE.h" #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" #include "llvm/Transforms/Scalar/BDCE.h" -#include "llvm/Transforms/Scalar/CallSiteSplitting.h" #include "llvm/Transforms/Scalar/ConstantHoisting.h" #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" #include "llvm/Transforms/Scalar/DCE.h" @@ -549,9 +548,6 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, EarlyFPM.addPass(SROA()); EarlyFPM.addPass(EarlyCSEPass()); EarlyFPM.addPass(LowerExpectIntrinsicPass()); - if (Level == O3) - EarlyFPM.addPass(CallSiteSplittingPass()); - // In SamplePGO ThinLTO backend, we need instcombine before profile annotation // to convert bitcast to direct calls so that they can be inlined during the // profile annotation prepration step. @@ -924,16 +920,13 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MPM.addPass(InferFunctionAttrsPass()); if (Level > 1) { - FunctionPassManager EarlyFPM(DebugLogging); - EarlyFPM.addPass(CallSiteSplittingPass()); - MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM))); - // Indirect call promotion. This should promote all the targets that are // left by the earlier promotion pass that promotes intra-module targets. // This two-step promotion is to save the compile time. For LTO, it should // produce the same result as if we only do promotion here. MPM.addPass(PGOIndirectCallPromotion( true /* InLTO */, PGOOpt && !PGOOpt->SampleProfileFile.empty())); + // Propagate constants at call sites into the functions they call. This // opens opportunities for globalopt (and inlining) by substituting function // pointers passed as arguments to direct uses of functions. 
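For context on what this revert takes out: the pass being removed was a standard new-pass-manager function pass, so the pipeline hunks above correspond to a small, fixed amount of boilerplate. A minimal sketch of that shape follows (illustrative only; ExampleSplittingPass and its body are placeholders, not code from this patch):

  #include "llvm/IR/PassManager.h"

  namespace llvm {
  // A new-PM function pass is a lightweight struct whose run() method
  // transforms one Function and reports which analyses it preserved.
  struct ExampleSplittingPass : PassInfoMixin<ExampleSplittingPass> {
    PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
      bool Changed = false;
      // ... inspect and transform call sites in F here ...
      return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
    }
  };
  } // namespace llvm

  // Wiring it into a module pipeline, mirroring the hunk removed above:
  //   FunctionPassManager FPM;
  //   FPM.addPass(llvm::ExampleSplittingPass());
  //   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
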
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def index 40b884351fd..20d1220ac33 100644 --- a/lib/Passes/PassRegistry.def +++ b/lib/Passes/PassRegistry.def @@ -140,7 +140,6 @@ FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass()) FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass()) FUNCTION_PASS("bdce", BDCEPass()) FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass()) -FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass()) FUNCTION_PASS("consthoist", ConstantHoistingPass()) FUNCTION_PASS("correlated-propagation", CorrelatedValuePropagationPass()) FUNCTION_PASS("dce", DCEPass()) diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index b8ff614f7c8..828eb5eee29 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -467,9 +467,6 @@ void PassManagerBuilder::populateModulePassManager( addExtensionsToPM(EP_ModuleOptimizerEarly, MPM); - if (OptLevel > 2) - MPM.add(createCallSiteSplittingPass()); - MPM.add(createIPSCCPPass()); // IP SCCP MPM.add(createCalledValuePropagationPass()); MPM.add(createGlobalOptimizerPass()); // Optimize out global vars @@ -706,9 +703,6 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createInferFunctionAttrsLegacyPass()); if (OptLevel > 1) { - // Split call-site with more constrained arguments. - PM.add(createCallSiteSplittingPass()); - // Indirect call promotion. This should promote all the targets that are // left by the earlier promotion pass that promotes intra-module targets. // This two-step promotion is to save the compile time. For LTO, it should diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 6a27fbca8b7..d79ae851005 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -2,7 +2,6 @@ add_llvm_library(LLVMScalarOpts ADCE.cpp AlignmentFromAssumptions.cpp BDCE.cpp - CallSiteSplitting.cpp ConstantHoisting.cpp ConstantProp.cpp CorrelatedValuePropagation.cpp diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp deleted file mode 100644 index 251e3322359..00000000000 --- a/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ /dev/null @@ -1,492 +0,0 @@ -//===- CallSiteSplitting.cpp ----------------------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements a transformation that tries to split a call-site to pass -// more constrained arguments if its argument is predicated in the control flow -// so that we can expose better context to the later passes (e.g, inliner, jump -// threading, or IPA-CP based function cloning, etc.). -// As of now we support two cases : -// -// 1) If a call site is dominated by an OR condition and if any of its arguments -// are predicated on this OR condition, try to split the condition with more -// constrained arguments. For example, in the code below, we try to split the -// call site since we can predicate the argument(ptr) based on the OR condition. 
-// -// Split from : -// if (!ptr || c) -// callee(ptr); -// to : -// if (!ptr) -// callee(null) // set the known constant value -// else if (c) -// callee(nonnull ptr) // set non-null attribute in the argument -// -// 2) We can also split a call-site based on constant incoming values of a PHI -// For example, -// from : -// Header: -// %c = icmp eq i32 %i1, %i2 -// br i1 %c, label %Tail, label %TBB -// TBB: -// br label Tail% -// Tail: -// %p = phi i32 [ 0, %Header], [ 1, %TBB] -// call void @bar(i32 %p) -// to -// Header: -// %c = icmp eq i32 %i1, %i2 -// br i1 %c, label %Tail-split0, label %TBB -// TBB: -// br label %Tail-split1 -// Tail-split0: -// call void @bar(i32 0) -// br label %Tail -// Tail-split1: -// call void @bar(i32 1) -// br label %Tail -// Tail: -// %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ] -// -//===----------------------------------------------------------------------===// - -#include "llvm/Transforms/Scalar/CallSiteSplitting.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/PatternMatch.h" -#include "llvm/Support/Debug.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" - -using namespace llvm; -using namespace PatternMatch; - -#define DEBUG_TYPE "callsite-splitting" - -STATISTIC(NumCallSiteSplit, "Number of call-site split"); - -static void addNonNullAttribute(Instruction *CallI, Instruction *&NewCallI, - Value *Op) { - if (!NewCallI) - NewCallI = CallI->clone(); - CallSite CS(NewCallI); - unsigned ArgNo = 0; - for (auto &I : CS.args()) { - if (&*I == Op) - CS.addParamAttr(ArgNo, Attribute::NonNull); - ++ArgNo; - } -} - -static void setConstantInArgument(Instruction *CallI, Instruction *&NewCallI, - Value *Op, Constant *ConstValue) { - if (!NewCallI) - NewCallI = CallI->clone(); - CallSite CS(NewCallI); - unsigned ArgNo = 0; - for (auto &I : CS.args()) { - if (&*I == Op) - CS.setArgument(ArgNo, ConstValue); - ++ArgNo; - } -} - -static bool createCallSitesOnOrPredicatedArgument( - CallSite CS, Instruction *&NewCSTakenFromHeader, - Instruction *&NewCSTakenFromNextCond, - SmallVectorImpl &BranchInsts, BasicBlock *HeaderBB) { - assert(BranchInsts.size() <= 2 && - "Unexpected number of blocks in the OR predicated condition"); - Instruction *Instr = CS.getInstruction(); - BasicBlock *CallSiteBB = Instr->getParent(); - TerminatorInst *HeaderTI = HeaderBB->getTerminator(); - bool IsCSInTakenPath = CallSiteBB == HeaderTI->getSuccessor(0); - - for (unsigned I = 0, E = BranchInsts.size(); I != E; ++I) { - BranchInst *PBI = BranchInsts[I]; - assert(isa(PBI->getCondition()) && - "Unexpected condition in a conditional branch."); - ICmpInst *Cmp = cast(PBI->getCondition()); - Value *Arg = Cmp->getOperand(0); - assert(isa(Cmp->getOperand(1)) && - "Expected op1 to be a constant."); - Constant *ConstVal = cast(Cmp->getOperand(1)); - CmpInst::Predicate Pred = Cmp->getPredicate(); - - if (PBI->getParent() == HeaderBB) { - Instruction *&CallTakenFromHeader = - IsCSInTakenPath ? NewCSTakenFromHeader : NewCSTakenFromNextCond; - Instruction *&CallUntakenFromHeader = - IsCSInTakenPath ? NewCSTakenFromNextCond : NewCSTakenFromHeader; - - assert(Pred == ICmpInst::ICMP_EQ || - Pred == ICmpInst::ICMP_NE && - "Unexpected predicate in an OR condition"); - - // Set the constant value for agruments in the call predicated based on - // the OR condition. - Instruction *&CallToSetConst = Pred == ICmpInst::ICMP_EQ - ? 
CallTakenFromHeader - : CallUntakenFromHeader; - setConstantInArgument(Instr, CallToSetConst, Arg, ConstVal); - - // Add the NonNull attribute if compared with the null pointer. - if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) { - Instruction *&CallToSetAttr = Pred == ICmpInst::ICMP_EQ - ? CallUntakenFromHeader - : CallTakenFromHeader; - addNonNullAttribute(Instr, CallToSetAttr, Arg); - } - continue; - } - - if (Pred == ICmpInst::ICMP_EQ) { - if (PBI->getSuccessor(0) == Instr->getParent()) { - // Set the constant value for the call taken from the second block in - // the OR condition. - setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal); - } else { - // Add the NonNull attribute if compared with the null pointer for the - // call taken from the second block in the OR condition. - if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) - addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg); - } - } else { - if (PBI->getSuccessor(0) == Instr->getParent()) { - // Add the NonNull attribute if compared with the null pointer for the - // call taken from the second block in the OR condition. - if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) - addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg); - } else if (Pred == ICmpInst::ICMP_NE) { - // Set the constant value for the call in the untaken path from the - // header block. - setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal); - } else - llvm_unreachable("Unexpected condition"); - } - } - return NewCSTakenFromHeader || NewCSTakenFromNextCond; -} - -static bool canSplitCallSite(CallSite CS) { - // FIXME: As of now we handle only CallInst. InvokeInst could be handled - // without too much effort. - Instruction *Instr = CS.getInstruction(); - if (!isa(Instr)) - return false; - - // Allow splitting a call-site only when there is no instruction before the - // call-site in the basic block. Based on this constraint, we only clone the - // call instruction, and we do not move a call-site across any other - // instruction. - BasicBlock *CallSiteBB = Instr->getParent(); - if (Instr != CallSiteBB->getFirstNonPHI()) - return false; - - pred_iterator PII = pred_begin(CallSiteBB); - pred_iterator PIE = pred_end(CallSiteBB); - unsigned NumPreds = std::distance(PII, PIE); - - // Allow only one extra call-site. No more than two from one call-site. - if (NumPreds != 2) - return false; - - // Cannot split an edge from an IndirectBrInst. - BasicBlock *Preds[2] = {*PII++, *PII}; - if (isa(Preds[0]->getTerminator()) || - isa(Preds[1]->getTerminator())) - return false; - - return CallSiteBB->canSplitPredecessors(); -} - -/// Return true if the CS is split into its new predecessors which are directly -/// hooked to each of its orignial predecessors pointed by PredBB1 and PredBB2. -/// Note that PredBB1 and PredBB2 are decided in findPredicatedArgument(), -/// especially for the OR predicated case where PredBB1 will point the header, -/// and PredBB2 will point the the second compare block. CallInst1 and CallInst2 -/// will be the new call-sites placed in the new predecessors split for PredBB1 -/// and PredBB2, repectively. Therefore, CallInst1 will be the call-site placed -/// between Header and Tail, and CallInst2 will be the call-site between TBB and -/// Tail. 
For example, in the IR below with an OR condition, the call-site can -/// be split -/// -/// from : -/// -/// Header: -/// %c = icmp eq i32* %a, null -/// br i1 %c %Tail, %TBB -/// TBB: -/// %c2 = icmp eq i32* %b, null -/// br i1 %c %Tail, %End -/// Tail: -/// %ca = call i1 @callee (i32* %a, i32* %b) -/// -/// to : -/// -/// Header: // PredBB1 is Header -/// %c = icmp eq i32* %a, null -/// br i1 %c %Tail-split1, %TBB -/// TBB: // PredBB2 is TBB -/// %c2 = icmp eq i32* %b, null -/// br i1 %c %Tail-split2, %End -/// Tail-split1: -/// %ca1 = call @callee (i32* null, i32* %b) // CallInst1 -/// br %Tail -/// Tail-split2: -/// %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2 -/// br %Tail -/// Tail: -/// %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2] -/// -/// Note that for an OR predicated case, CallInst1 and CallInst2 should be -/// created with more constrained arguments in -/// createCallSitesOnOrPredicatedArgument(). -static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2, - Instruction *CallInst1, Instruction *CallInst2) { - Instruction *Instr = CS.getInstruction(); - BasicBlock *TailBB = Instr->getParent(); - assert(Instr == (TailBB->getFirstNonPHI()) && "Unexpected call-site"); - - BasicBlock *SplitBlock1 = - SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split"); - BasicBlock *SplitBlock2 = - SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split"); - - assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split."); - - if (!CallInst1) - CallInst1 = Instr->clone(); - if (!CallInst2) - CallInst2 = Instr->clone(); - - CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt()); - CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt()); - - CallSite CS1(CallInst1); - CallSite CS2(CallInst2); - - // Handle PHIs used as arguments in the call-site. - for (auto &PI : *TailBB) { - PHINode *PN = dyn_cast(&PI); - if (!PN) - break; - unsigned ArgNo = 0; - for (auto &CI : CS.args()) { - if (&*CI == PN) { - CS1.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock1)); - CS2.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock2)); - } - ++ArgNo; - } - } - - // Replace users of the original call with a PHI mering call-sites split. - if (Instr->getNumUses()) { - PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call", Instr); - PN->addIncoming(CallInst1, SplitBlock1); - PN->addIncoming(CallInst2, SplitBlock2); - Instr->replaceAllUsesWith(PN); - } - DEBUG(dbgs() << "split call-site : " << *Instr << " into \n"); - DEBUG(dbgs() << " " << *CallInst1 << " in " << SplitBlock1->getName() - << "\n"); - DEBUG(dbgs() << " " << *CallInst2 << " in " << SplitBlock2->getName() - << "\n"); - Instr->eraseFromParent(); - NumCallSiteSplit++; -} - -static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) { - assert(isa(Cmp->getOperand(1)) && "Expected a constant operand."); - Value *Op0 = Cmp->getOperand(0); - unsigned ArgNo = 0; - for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E; - ++I, ++ArgNo) { - // Don't consider constant or arguments that are already known non-null. 
- if (isa(*I) || CS.paramHasAttr(ArgNo, Attribute::NonNull)) - continue; - - if (*I == Op0) - return true; - } - return false; -} - -static void findOrCondRelevantToCallArgument( - CallSite CS, BasicBlock *PredBB, BasicBlock *OtherPredBB, - SmallVectorImpl &BranchInsts, BasicBlock *&HeaderBB) { - auto *PBI = dyn_cast(PredBB->getTerminator()); - if (!PBI || !PBI->isConditional()) - return; - - if (PBI->getSuccessor(0) == OtherPredBB || - PBI->getSuccessor(1) == OtherPredBB) - if (PredBB == OtherPredBB->getSinglePredecessor()) { - assert(!HeaderBB && "Expect to find only a single header block"); - HeaderBB = PredBB; - } - - CmpInst::Predicate Pred; - Value *Cond = PBI->getCondition(); - if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant()))) - return; - ICmpInst *Cmp = cast(Cond); - if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) - if (isCondRelevantToAnyCallArgument(Cmp, CS)) - BranchInsts.push_back(PBI); -} - -// Return true if the call-site has an argument which is a PHI with only -// constant incoming values. -static bool isPredicatedOnPHI(CallSite CS) { - Instruction *Instr = CS.getInstruction(); - BasicBlock *Parent = Instr->getParent(); - if (Instr != Parent->getFirstNonPHI()) - return false; - - for (auto &BI : *Parent) { - if (PHINode *PN = dyn_cast(&BI)) { - for (auto &I : CS.args()) - if (&*I == PN) { - assert(PN->getNumIncomingValues() == 2 && - "Unexpected number of incoming values"); - if (PN->getIncomingBlock(0) == PN->getIncomingBlock(1)) - return false; - if (PN->getIncomingValue(0) == PN->getIncomingValue(1)) - continue; - if (isa(PN->getIncomingValue(0)) && - isa(PN->getIncomingValue(1))) - return true; - } - } - break; - } - return false; -} - -// Return true if an agument in CS is predicated on an 'or' condition. -// Create new call-site with arguments constrained based on the OR condition. -static bool findPredicatedOnOrCondition(CallSite CS, BasicBlock *PredBB1, - BasicBlock *PredBB2, - Instruction *&NewCallTakenFromHeader, - Instruction *&NewCallTakenFromNextCond, - BasicBlock *&HeaderBB) { - SmallVector BranchInsts; - findOrCondRelevantToCallArgument(CS, PredBB1, PredBB2, BranchInsts, HeaderBB); - findOrCondRelevantToCallArgument(CS, PredBB2, PredBB1, BranchInsts, HeaderBB); - if (BranchInsts.empty() || !HeaderBB) - return false; - - // If an OR condition is detected, try to create call sites with constrained - // arguments (e.g., NonNull attribute or constant value). - return createCallSitesOnOrPredicatedArgument(CS, NewCallTakenFromHeader, - NewCallTakenFromNextCond, - BranchInsts, HeaderBB); -} - -static bool findPredicatedArgument(CallSite CS, Instruction *&CallInst1, - Instruction *&CallInst2, - BasicBlock *&PredBB1, BasicBlock *&PredBB2) { - BasicBlock *CallSiteBB = CS.getInstruction()->getParent(); - pred_iterator PII = pred_begin(CallSiteBB); - pred_iterator PIE = pred_end(CallSiteBB); - assert(std::distance(PII, PIE) == 2 && "Expect only two predecessors."); - BasicBlock *Preds[2] = {*PII++, *PII}; - BasicBlock *&HeaderBB = PredBB1; - if (!findPredicatedOnOrCondition(CS, Preds[0], Preds[1], CallInst1, CallInst2, - HeaderBB) && - !isPredicatedOnPHI(CS)) - return false; - - if (!PredBB1) - PredBB1 = Preds[0]; - - PredBB2 = PredBB1 == Preds[0] ? 
Preds[1] : Preds[0]; - return true; -} - -static bool tryToSplitCallSite(CallSite CS) { - if (!CS.arg_size()) - return false; - - BasicBlock *PredBB1 = nullptr; - BasicBlock *PredBB2 = nullptr; - Instruction *CallInst1 = nullptr; - Instruction *CallInst2 = nullptr; - if (!canSplitCallSite(CS) || - !findPredicatedArgument(CS, CallInst1, CallInst2, PredBB1, PredBB2)) { - assert(!CallInst1 && !CallInst2 && "Unexpected new call-sites cloned."); - return false; - } - splitCallSite(CS, PredBB1, PredBB2, CallInst1, CallInst2); - return true; -} - -static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) { - bool Changed = false; - for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) { - BasicBlock &BB = *BI++; - for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) { - Instruction *I = &*II++; - CallSite CS(cast(I)); - if (!CS || isa(I) || isInstructionTriviallyDead(I, &TLI)) - continue; - - Function *Callee = CS.getCalledFunction(); - if (!Callee || Callee->isDeclaration()) - continue; - Changed |= tryToSplitCallSite(CS); - } - } - return Changed; -} - -namespace { -struct CallSiteSplittingLegacyPass : public FunctionPass { - static char ID; - CallSiteSplittingLegacyPass() : FunctionPass(ID) { - initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - FunctionPass::getAnalysisUsage(AU); - } - - bool runOnFunction(Function &F) override { - if (skipFunction(F)) - return false; - - auto &TLI = getAnalysis().getTLI(); - return doCallSiteSplitting(F, TLI); - } -}; -} // namespace - -char CallSiteSplittingLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting", - "Call-site splitting", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting", - "Call-site splitting", false, false) -FunctionPass *llvm::createCallSiteSplittingPass() { - return new CallSiteSplittingLegacyPass(); -} - -PreservedAnalyses CallSiteSplittingPass::run(Function &F, - FunctionAnalysisManager &AM) { - auto &TLI = AM.getResult(F); - - if (!doCallSiteSplitting(F, TLI)) - return PreservedAnalyses::all(); - PreservedAnalyses PA; - return PA; -} diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 8a5ae1b8731..c1034ace206 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -35,7 +35,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCELegacyPassPass(Registry); initializeBDCELegacyPassPass(Registry); initializeAlignmentFromAssumptionsPass(Registry); - initializeCallSiteSplittingLegacyPassPass(Registry); initializeConstantHoistingLegacyPassPass(Registry); initializeConstantPropagationPass(Registry); initializeCorrelatedValuePropagationPass(Registry); diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll index 0810a13c141..816f75310e3 100644 --- a/test/Other/new-pm-defaults.ll +++ b/test/Other/new-pm-defaults.ll @@ -76,7 +76,6 @@ ; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass -; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. 
; CHECK-O-NEXT: Running pass: IPSCCPPass ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass diff --git a/test/Other/new-pm-lto-defaults.ll b/test/Other/new-pm-lto-defaults.ll index 878198d1447..fc52f70ff4c 100644 --- a/test/Other/new-pm-lto-defaults.ll +++ b/test/Other/new-pm-lto-defaults.ll @@ -29,14 +29,9 @@ ; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass ; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis -; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> -; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Module -; CHECK-O2-NEXT: Starting llvm::Function pass manager run. -; CHECK-O2-NEXT: Running pass: CallSiteSplittingPass on foo -; CHECK-O2-NEXT: Running analysis: TargetLibraryAnalysis on foo -; CHECK-O2-NEXT: Finished llvm::Function pass manager run. ; CHECK-O2-NEXT: PGOIndirectCallPromotion ; CHECK-O2-NEXT: Running analysis: ProfileSummaryAnalysis +; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Function ; CHECK-O2-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis ; CHECK-O2-NEXT: Running pass: IPSCCPPass ; CHECK-O2-NEXT: Running pass: CalledValuePropagationPass @@ -47,7 +42,7 @@ ; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}> ; CHECK-O-NEXT: Running analysis: AAManager -; CHECK-O1-NEXT: Running analysis: TargetLibraryAnalysis +; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass ; CHECK-O-NEXT: Running analysis: CallGraphAnalysis ; CHECK-O-NEXT: Running pass: GlobalSplitPass diff --git a/test/Other/new-pm-thinlto-defaults.ll b/test/Other/new-pm-thinlto-defaults.ll index e83f0f87055..7d40ef3eea2 100644 --- a/test/Other/new-pm-thinlto-defaults.ll +++ b/test/Other/new-pm-thinlto-defaults.ll @@ -72,7 +72,6 @@ ; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass -; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. 
; CHECK-O-NEXT: Running pass: IPSCCPPass ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass diff --git a/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll deleted file mode 100644 index d1d854d8f45..00000000000 --- a/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll +++ /dev/null @@ -1,339 +0,0 @@ -; RUN: opt < %s -callsite-splitting -S | FileCheck %s -; RUN: opt < %s -passes='function(callsite-splitting)' -S | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-linaro-linux-gnueabi" - -;CHECK-LABEL: @test_eq_eq -;CHECK-LABEL: Tail.predBB1.split: -;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) -;CHECK-LABEL: Tail.predBB2.split: -;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) -;CHECK-LABEL: Tail -;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] -;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] -;CHECK: ret i32 %[[MERGED]] -define i32 @test_eq_eq(i32* %a, i32 %v) { -Header: - %tobool1 = icmp eq i32* %a, null - br i1 %tobool1, label %Tail, label %TBB - -TBB: - %cmp = icmp eq i32 %v, 1 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_ne_eq -;CHECK-LABEL: Tail.predBB1.split: -;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) -;CHECK-LABEL: Tail.predBB2.split: -;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2) -;CHECK-LABEL: Tail -;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] -;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] -;CHECK: ret i32 %[[MERGED]] -define i32 @test_ne_eq(i32* %a, i32 %v) { -Header: - %tobool1 = icmp ne i32* %a, null - br i1 %tobool1, label %Tail, label %TBB - -TBB: - %cmp = icmp eq i32 %v, 1 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_ne_ne -;CHECK-LABEL: Tail.predBB1.split: -;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) -;CHECK-LABEL: Tail.predBB2.split: -;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 2) -;CHECK-LABEL: Tail -;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] -;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] -;CHECK: ret i32 %[[MERGED]] -define i32 @test_ne_ne(i32* %a, i32 %v) { -Header: - %tobool1 = icmp ne i32* %a, null - br i1 %tobool1, label %Tail, label %TBB - -TBB: - %cmp = icmp ne i32 %v, 1 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_eq_eq_untaken -;CHECK-LABEL: Tail.predBB1.split: -;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) -;CHECK-LABEL: Tail.predBB2.split: -;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2) -;CHECK-LABEL: Tail -;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] -;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] -;CHECK: ret i32 %[[MERGED]] -define i32 
@test_eq_eq_untaken(i32* %a, i32 %v) { -Header: - %tobool1 = icmp eq i32* %a, null - br i1 %tobool1, label %TBB, label %Tail - -TBB: - %cmp = icmp eq i32 %v, 1 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_ne_eq_untaken -;CHECK-LABEL: Tail.predBB1.split: -;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) -;CHECK-LABEL: Tail.predBB2.split: -;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) -;CHECK-LABEL: Tail -;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] -;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] -;CHECK: ret i32 %[[MERGED]] -define i32 @test_ne_eq_untaken(i32* %a, i32 %v) { -Header: - %tobool1 = icmp ne i32* %a, null - br i1 %tobool1, label %TBB, label %Tail - -TBB: - %cmp = icmp eq i32 %v, 1 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_ne_ne_untaken -;CHECK-LABEL: Tail.predBB1.split: -;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) -;CHECK-LABEL: Tail.predBB2.split: -;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) -;CHECK-LABEL: Tail -;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] -;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] -;CHECK: ret i32 %[[MERGED]] -define i32 @test_ne_ne_untaken(i32* %a, i32 %v) { -Header: - %tobool1 = icmp ne i32* %a, null - br i1 %tobool1, label %TBB, label %Tail - -TBB: - %cmp = icmp ne i32 %v, 1 - br i1 %cmp, label %End, label %Tail - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_nonconst_const_phi -;CHECK-LABEL: Tail.predBB1.split: -;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1) -;CHECK-LABEL: Tail.predBB2.split: -;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 2) -;CHECK-LABEL: Tail -;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] -;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] -;CHECK: ret i32 %[[MERGED]] -define i32 @test_nonconst_const_phi(i32* %a, i32* %b, i32 %v) { -Header: - %tobool1 = icmp eq i32* %a, %b - br i1 %tobool1, label %Tail, label %TBB - -TBB: - %cmp = icmp eq i32 %v, 1 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_nonconst_nonconst_phi -;CHECK-LABEL: Tail.predBB1.split: -;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1) -;CHECK-LABEL: Tail.predBB2.split: -;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 %v, i32 2) -;CHECK-LABEL: Tail -;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] -;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] -;CHECK: ret i32 %[[MERGED]] -define i32 @test_nonconst_nonconst_phi(i32* %a, i32* %b, i32 %v, i32 %v2) { -Header: - %tobool1 = icmp eq i32* %a, %b - br i1 %tobool1, label %Tail, label %TBB - -TBB: - %cmp = icmp eq i32 %v, %v2 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi 
i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_nonconst_nonconst_phi_noncost -;CHECK-NOT: Tail.predBB1.split: -;CHECK-NOT: Tail.predBB2.split: -;CHECK-LABEL: Tail: -;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) -;CHECK: ret i32 %r -define i32 @test_nonconst_nonconst_phi_noncost(i32* %a, i32* %b, i32 %v, i32 %v2) { -Header: - %tobool1 = icmp eq i32* %a, %b - br i1 %tobool1, label %Tail, label %TBB - -TBB: - %cmp = icmp eq i32 %v, %v2 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi i32[%v,%Header], [%v2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_fisrtnonphi -;CHECK-NOT: Tail.predBB1.split: -;CHECK-NOT: Tail.predBB2.split: -;CHECK-LABEL: Tail: -;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) -;CHECK: ret i32 %r -define i32 @test_fisrtnonphi(i32* %a, i32 %v) { -Header: - %tobool1 = icmp eq i32* %a, null - br i1 %tobool1, label %Tail, label %TBB - -TBB: - %cmp = icmp eq i32 %v, 1 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - store i32 %v, i32* %a - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_3preds_constphi -;CHECK-NOT: Tail.predBB1.split: -;CHECK-NOT: Tail.predBB2.split: -;CHECK-LABEL: Tail: -;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) -;CHECK: ret i32 %r -define i32 @test_3preds_constphi(i32* %a, i32 %v, i1 %c1, i1 %c2, i1 %c3) { -Header: - br i1 %c1, label %Tail, label %TBB1 - -TBB1: - br i1 %c2, label %Tail, label %TBB2 - -TBB2: - br i1 %c3, label %Tail, label %End - -Tail: - %p = phi i32[1,%Header], [2, %TBB1], [3, %TBB2] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_indirectbr_phi -;CHECK-NOT: Tail.predBB1.split: -;CHECK-NOT: Tail.predBB2.split: -;CHECK-LABEL: Tail: -;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) -;CHECK: ret i32 %r -define i32 @test_indirectbr_phi(i8* %address, i32* %a, i32* %b, i32 %v) { -Header: - %indirect.goto.dest = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address - indirectbr i8* %indirect.goto.dest, [label %TBB, label %Tail] - -TBB: - %indirect.goto.dest2 = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address - indirectbr i8* %indirect.goto.dest2, [label %Tail, label %End] - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -define i32 @callee(i32* %a, i32 %v, i32 %p) { -entry: - %c = icmp ne i32* %a, null - br i1 %c, label %BB1, label %BB2 - -BB1: - call void @dummy(i32* %a, i32 %p) - br label %End - -BB2: - call void @dummy2(i32 %v, i32 %p) - br label %End - -End: - ret i32 %p -} - -declare void @dummy(i32*, i32) -declare void @dummy2(i32, i32) diff --git a/test/Transforms/CallSiteSplitting/callsite-split.ll b/test/Transforms/CallSiteSplitting/callsite-split.ll deleted file mode 100644 index 419fa738563..00000000000 --- a/test/Transforms/CallSiteSplitting/callsite-split.ll +++ /dev/null @@ -1,119 +0,0 @@ -; RUN: opt < %s -callsite-splitting -inline -instcombine -jump-threading -S | FileCheck %s -; RUN: opt < %s -passes='function(callsite-splitting),cgscc(inline),function(instcombine,jump-threading)' -S | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-linaro-linux-gnueabi" - -%struct.bitmap = type { 
i32, %struct.bitmap* }
-
-;CHECK-LABEL: @caller
-;CHECK-LABEL: NextCond:
-;CHECK: br {{.*}} label %callee.exit
-;CHECK-LABEL: CallSiteBB.predBB1.split:
-;CHECK: call void @callee(%struct.bitmap* null, %struct.bitmap* null, %struct.bitmap* %b_elt, i1 false)
-;CHECK-LABEL: callee.exit:
-;CHECK: call void @dummy2(%struct.bitmap* %a_elt)
-
-define void @caller(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt) {
-entry:
-  br label %Top
-
-Top:
-  %tobool1 = icmp eq %struct.bitmap* %a_elt, null
-  br i1 %tobool1, label %CallSiteBB, label %NextCond
-
-NextCond:
-  %cmp = icmp ne %struct.bitmap* %b_elt, null
-  br i1 %cmp, label %CallSiteBB, label %End
-
-CallSiteBB:
-  %p = phi i1 [0, %Top], [%c, %NextCond]
-  call void @callee(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %p)
-  br label %End
-
-End:
-  ret void
-}
-
-define void @callee(%struct.bitmap* %dst_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %c) {
-entry:
-  %tobool = icmp ne %struct.bitmap* %a_elt, null
-  %tobool1 = icmp ne %struct.bitmap* %b_elt, null
-  %or.cond = and i1 %tobool, %tobool1
-  br i1 %or.cond, label %Cond, label %Big
-
-Cond:
-  %cmp = icmp eq %struct.bitmap* %dst_elt, %a_elt
-  br i1 %cmp, label %Small, label %Big
-
-Small:
-  call void @dummy2(%struct.bitmap* %a_elt)
-  br label %End
-
-Big:
-  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
-  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
-  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
-  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
-  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
-  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
-  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
-  br label %End
-
-End:
-  ret void
-}
-
-declare void @dummy2(%struct.bitmap*)
-declare void @dummy1(%struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*)
-
-
-;CHECK-LABEL: @caller2
-;CHECK-LABEL: CallSiteBB.predBB1.split:
-;CHECK: call void @dummy4()
-;CHECK-LABEL: CallSiteBB.predBB2.split:
-;CHECK: call void @dummy3()
-;CHECK-LABEL: CallSiteBB:
-;CHECK: %phi.call = phi i1 [ false, %CallSiteBB.predBB1.split ], [ true, %CallSiteBB.predBB2.split ]
-;CHECK: call void @foo(i1 %phi.call)
-define void @caller2(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, %struct.bitmap* %c_elt) {
-entry:
-  br label %Top
-
-Top:
-  %tobool1 = icmp eq %struct.bitmap* %a_elt, %b_elt
-  br i1 %tobool1, label %CallSiteBB, label %NextCond
-
-NextCond:
-  %cmp = icmp ne %struct.bitmap* %b_elt, %c_elt
-  br i1 %cmp, label %CallSiteBB, label %End
-
-CallSiteBB:
-  %phi = phi i1 [0, %Top],[1, %NextCond]
-  %u = call i1 @callee2(i1 %phi)
-  call void @foo(i1 %u)
-  br label %End
-
-End:
-  ret void
-}
-
-define i1 @callee2(i1 %b) {
-entry:
-  br i1 %b, label %BB1, label %BB2
%BB1, label %BB2 - -BB1: - call void @dummy3() - br label %End - -BB2: - call void @dummy4() - br label %End - -End: - ret i1 %b -} - -declare void @dummy3() -declare void @dummy4() -declare void @foo(i1) -- cgit v1.2.3 From b24883f402ddb788a35189d091f5fb5286dc74f7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Nov 2017 19:37:41 +0000 Subject: [X86] Promote athlon, athlon-xp, k8, and k8-sse3 to types instead of subtypes in getHostCPUName. NFCI This removes the athlon type and simplifies the string decoding. We only really need these type/subtype breaks where we need to match libgcc/compiler-rt and these CPUs aren't part of that. I'm looking into moving some of this information to a .def file to share with clang's __builtin_cpu_is handling. And while these CPUs aren't part of that the less lines I have to deal with in the .def file the better. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317354 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Support/Host.cpp | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 40ed87bf40d..5b2a0f1d0c2 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -355,7 +355,10 @@ enum ProcessorTypes { INTEL_PRESCOTT, AMD_i486, AMDPENTIUM, - AMDATHLON, + AMD_ATHLON, + AMD_ATHLON_XP, + AMD_K8, + AMD_K8SSE3, INTEL_GOLDMONT, CPU_TYPE_MAX }; @@ -384,10 +387,6 @@ enum ProcessorSubtypes { AMDPENTIUM_K62, AMDPENTIUM_K63, AMDPENTIUM_GEODE, - AMDATHLON_CLASSIC, - AMDATHLON_XP, - AMDATHLON_K8, - AMDATHLON_K8SSE3, CPU_SUBTYPE_MAX }; @@ -864,20 +863,18 @@ static void getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model, } break; case 6: - *Type = AMDATHLON; if (Features & (1 << FEATURE_SSE)) { - *Subtype = AMDATHLON_XP; + *Type = AMD_ATHLON_XP; break; // "athlon-xp" } - *Subtype = AMDATHLON_CLASSIC; + *Type = AMD_ATHLON; break; // "athlon" case 15: - *Type = AMDATHLON; if (Features & (1 << FEATURE_SSE3)) { - *Subtype = AMDATHLON_K8SSE3; + *Type = AMD_K8SSE3; break; // "k8-sse3" } - *Subtype = AMDATHLON_K8; + *Type = AMD_K8; break; // "k8" case 16: *Type = AMDFAM10H; // "amdfam10" @@ -1149,19 +1146,14 @@ StringRef sys::getHostCPUName() { default: return "pentium"; } - case AMDATHLON: - switch (Subtype) { - case AMDATHLON_CLASSIC: - return "athlon"; - case AMDATHLON_XP: - return "athlon-xp"; - case AMDATHLON_K8: - return "k8"; - case AMDATHLON_K8SSE3: - return "k8-sse3"; - default: - llvm_unreachable("Unexpected subtype!"); - } + case AMD_ATHLON: + return "athlon"; + case AMD_ATHLON_XP: + return "athlon-xp"; + case AMD_K8: + return "k8"; + case AMD_K8SSE3: + return "k8-sse3"; case AMDFAM10H: return "amdfam10"; case AMD_BTVER1: -- cgit v1.2.3 From b72a3a9da434080da25914c9eed94416b1adee40 Mon Sep 17 00:00:00 2001 From: Mitch Phillips Date: Fri, 3 Nov 2017 20:00:05 +0000 Subject: [cfi-verify] Add an interesting unit test where undef search length changes result. Add an interesting unit test, found by changing --search-length-undef from the default. 
The program handles it correctly, but the test is good for ensuring
correctness on further changes. :)

Reviewers: pcc

Subscribers: mgorny, llvm-commits, kcc, vlad.tsyrklevich

Differential Revision: https://reviews.llvm.org/D38658

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317355 91177308-0d34-0410-b5e6-96231b3b80d8
---
 unittests/tools/llvm-cfi-verify/FileAnalysis.cpp | 53 ++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp b/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp
index 0df468e8995..a3da1fc3f56 100644
--- a/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp
+++ b/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp
@@ -650,7 +650,60 @@ TEST_F(BasicFileAnalysisTest, CFIProtectionComplexExample) {
           0x0f, 0x0b, // 22: ud2
       },
       0xDEADBEEF);
+  uint64_t PrevSearchLengthForUndef = SearchLengthForUndef;
+  SearchLengthForUndef = 5;
   EXPECT_FALSE(Analysis.isIndirectInstructionCFIProtected(0xDEADBEEF + 9));
+  SearchLengthForUndef = PrevSearchLengthForUndef;
+}
+
+TEST_F(BasicFileAnalysisTest, UndefSearchLengthOneTest) {
+  Analysis.parseSectionContents(
+      {
+          0x77, 0x0d,                   // 0x688118: ja 0x688127 [+12]
+          0x48, 0x89, 0xdf,             // 0x68811a: mov %rbx, %rdi
+          0xff, 0xd0,                   // 0x68811d: callq *%rax
+          0x48, 0x89, 0xdf,             // 0x68811f: mov %rbx, %rdi
+          0xe8, 0x09, 0x00, 0x00, 0x00, // 0x688122: callq 0x688130
+          0x0f, 0x0b,                   // 0x688127: ud2
+      },
+      0x688118);
+  uint64_t PrevSearchLengthForUndef = SearchLengthForUndef;
+  SearchLengthForUndef = 1;
+  EXPECT_TRUE(Analysis.isIndirectInstructionCFIProtected(0x68811d));
+  SearchLengthForUndef = PrevSearchLengthForUndef;
+}
+
+TEST_F(BasicFileAnalysisTest, UndefSearchLengthOneTestFarAway) {
+  Analysis.parseSectionContents(
+      {
+          0x74, 0x73,                         // 0x7759eb: je 0x775a60
+          0xe9, 0x1c, 0x04, 0x00, 0x00, 0x00, // 0x7759ed: jmpq 0x775e0e
+      },
+      0x7759eb);
+
+  Analysis.parseSectionContents(
+      {
+          0x0f, 0x85, 0xb2, 0x03, 0x00, 0x00, // 0x775a56: jne 0x775e0e
+          0x48, 0x83, 0xc3, 0xf4,             // 0x775a5c: add $0xfffffffffffffff4,%rbx
+          0x48, 0x8b, 0x7c, 0x24, 0x10,       // 0x775a60: mov 0x10(%rsp),%rdi
+          0x48, 0x89, 0xde,                   // 0x775a65: mov %rbx,%rsi
+          0xff, 0xd1,                         // 0x775a68: callq *%rcx
+      },
+      0x775a56);
+
+  Analysis.parseSectionContents(
+      {
+          0x0f, 0x0b, // 0x775e0e: ud2
+      },
+      0x775e0e);
+  uint64_t PrevSearchLengthForUndef = SearchLengthForUndef;
+  SearchLengthForUndef = 1;
+  EXPECT_FALSE(Analysis.isIndirectInstructionCFIProtected(0x775a68));
+  SearchLengthForUndef = 2;
+  EXPECT_TRUE(Analysis.isIndirectInstructionCFIProtected(0x775a68));
+  SearchLengthForUndef = 3;
+  EXPECT_TRUE(Analysis.isIndirectInstructionCFIProtected(0x775a68));
+  SearchLengthForUndef = PrevSearchLengthForUndef;
 }

 } // anonymous namespace
-- cgit v1.2.3

From bdc30c02fb2f7dceab4499c871fc00aa9b7543b9 Mon Sep 17 00:00:00 2001
From: Aaron Ballman
Date: Fri, 3 Nov 2017 20:01:25 +0000
Subject: Add llvm::for_each as a range-based extension to <algorithm> and make
 use of it in some cases where it is a clearer alternative to std::for_each.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317356 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/STLExtras.h                       |  7 ++++++
 lib/LTO/LTOCodeGenerator.cpp                       | 20 ++++++++---------
 .../Hexagon/HexagonVectorLoopCarriedReuse.cpp      | 15 ++++++-------
 lib/Transforms/Utils/SplitModule.cpp               | 18 ++++++++--------
 tools/llvm-cxxdump/llvm-cxxdump.cpp                | 15 ++++++-------
 tools/llvm-mcmarkup/llvm-mcmarkup.cpp              | 13 ++++++-----
 tools/llvm-nm/llvm-nm.cpp                          |  3 +--
 tools/llvm-objdump/llvm-objdump.cpp                | 15 ++++++-------
 tools/llvm-pdbutil/llvm-pdbutil.cpp                | 25 ++++++++++------------
 tools/llvm-readobj/llvm-readobj.cpp                | 15 ++++++-------
 tools/llvm-size/llvm-size.cpp                      | 15 ++++++-------
 unittests/ADT/STLExtrasTest.cpp                    | 20 +++++++++++------
 12 files changed, 92 insertions(+), 89 deletions(-)

diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index 3ec9dfe5de0..c42d976f467 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -813,6 +813,13 @@ void DeleteContainerSeconds(Container &C) {
   C.clear();
 }
 
+/// Provide wrappers to std::for_each which take ranges instead of having to
+/// pass begin/end explicitly.
+template <typename R, typename UnaryPredicate>
+UnaryPredicate for_each(R &&Range, UnaryPredicate P) {
+  return std::for_each(std::begin(Range), std::end(Range), P);
+}
+
 /// Provide wrappers to std::all_of which take ranges instead of having to pass
 /// begin/end explicitly.
 template <typename R, typename UnaryPredicate>
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index 9759c0c6c1d..87867c54fad 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -469,17 +469,15 @@ void LTOCodeGenerator::restoreLinkageForExternals() {
     if (I == ExternalSymbols.end())
       return;
 
-    GV.setLinkage(I->second);
-  };
-
-  std::for_each(MergedModule->begin(), MergedModule->end(), externalize);
-  std::for_each(MergedModule->global_begin(), MergedModule->global_end(),
-                externalize);
-  std::for_each(MergedModule->alias_begin(), MergedModule->alias_end(),
-                externalize);
-}
-
-void LTOCodeGenerator::verifyMergedModuleOnce() {
+    GV.setLinkage(I->second);
+  };
+
+  llvm::for_each(MergedModule->functions(), externalize);
+  llvm::for_each(MergedModule->globals(), externalize);
+  llvm::for_each(MergedModule->aliases(), externalize);
+}
+
+void LTOCodeGenerator::verifyMergedModuleOnce() {
   // Only run on the first call.
  if (HasVerifiedInput)
    return;
diff --git a/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp b/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
index a0fdc70e141..52e5dcd4638 100644
--- a/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
+++ b/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
@@ -548,14 +548,13 @@ bool HexagonVectorLoopCarriedReuse::doVLCR() {
     findValueToReuse();
     if (ReuseCandidate.isDefined()) {
       reuseValue();
-      Changed = true;
-      Continue = true;
-    }
-    std::for_each(Dependences.begin(), Dependences.end(),
-                  std::default_delete<DepChain>());
-  } while (Continue);
-  return Changed;
-}
+      Changed = true;
+      Continue = true;
+    }
+    llvm::for_each(Dependences, std::default_delete<DepChain>());
+  } while (Continue);
+  return Changed;
+}
 
 void HexagonVectorLoopCarriedReuse::findDepChainFromPHI(Instruction *I,
                                                         DepChain &D) {
diff --git a/lib/Transforms/Utils/SplitModule.cpp b/lib/Transforms/Utils/SplitModule.cpp
index 07157069518..934a1bd73c2 100644
--- a/lib/Transforms/Utils/SplitModule.cpp
+++ b/lib/Transforms/Utils/SplitModule.cpp
@@ -141,15 +141,15 @@ static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap,
     }
 
     if (GV.hasLocalLinkage())
-      addAllGlobalValueUsers(GVtoClusterMap, &GV, &GV);
-  };
-
-  std::for_each(M->begin(), M->end(), recordGVSet);
-  std::for_each(M->global_begin(), M->global_end(), recordGVSet);
-  std::for_each(M->alias_begin(), M->alias_end(), recordGVSet);
-
-  // Assigned all GVs to merged clusters while balancing number of objects in
-  // each.
+      addAllGlobalValueUsers(GVtoClusterMap, &GV, &GV);
+  };
+
+  llvm::for_each(M->functions(), recordGVSet);
+  llvm::for_each(M->globals(), recordGVSet);
+  llvm::for_each(M->aliases(), recordGVSet);
+
+  // Assigned all GVs to merged clusters while balancing number of objects in
+  // each.
   auto CompareClusters = [](const std::pair<unsigned, unsigned> &a,
                             const std::pair<unsigned, unsigned> &b) {
     if (a.second || b.second)
diff --git a/tools/llvm-cxxdump/llvm-cxxdump.cpp b/tools/llvm-cxxdump/llvm-cxxdump.cpp
index b10759ad05c..69b1a8ef209 100644
--- a/tools/llvm-cxxdump/llvm-cxxdump.cpp
+++ b/tools/llvm-cxxdump/llvm-cxxdump.cpp
@@ -546,11 +546,10 @@ int main(int argc, const char *argv[]) {
   cl::ParseCommandLineOptions(argc, argv, "LLVM C++ ABI Data Dumper\n");
 
   // Default to stdin if no filename is specified.
-  if (opts::InputFilenames.size() == 0)
-    opts::InputFilenames.push_back("-");
-
-  std::for_each(opts::InputFilenames.begin(), opts::InputFilenames.end(),
-                dumpInput);
-
-  return EXIT_SUCCESS;
-}
+  if (opts::InputFilenames.size() == 0)
+    opts::InputFilenames.push_back("-");
+
+  llvm::for_each(opts::InputFilenames, dumpInput);
+
+  return EXIT_SUCCESS;
+}
diff --git a/tools/llvm-mcmarkup/llvm-mcmarkup.cpp b/tools/llvm-mcmarkup/llvm-mcmarkup.cpp
index 0be3c715eee..db57a6bdaa8 100644
--- a/tools/llvm-mcmarkup/llvm-mcmarkup.cpp
+++ b/tools/llvm-mcmarkup/llvm-mcmarkup.cpp
@@ -217,10 +217,9 @@ int main(int argc, char **argv) {
   ToolName = argv[0];
 
   // If no input files specified, read from stdin.
- if (InputFilenames.size() == 0) - InputFilenames.push_back("-"); - - std::for_each(InputFilenames.begin(), InputFilenames.end(), - parseMCMarkup); - return 0; -} + if (InputFilenames.size() == 0) + InputFilenames.push_back("-"); + + llvm::for_each(InputFilenames, parseMCMarkup); + return 0; +} diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp index 85204300284..d2909644628 100644 --- a/tools/llvm-nm/llvm-nm.cpp +++ b/tools/llvm-nm/llvm-nm.cpp @@ -1977,8 +1977,7 @@ int main(int argc, char **argv) { if (NoDyldInfo && (AddDyldInfo || DyldInfoOnly)) error("-no-dyldinfo can't be used with -add-dyldinfo or -dyldinfo-only"); - std::for_each(InputFilenames.begin(), InputFilenames.end(), - dumpSymbolNamesFromFile); + llvm::for_each(InputFilenames, dumpSymbolNamesFromFile); if (HadError) return 1; diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp index 09396466c40..d80f1cb049d 100644 --- a/tools/llvm-objdump/llvm-objdump.cpp +++ b/tools/llvm-objdump/llvm-objdump.cpp @@ -2183,11 +2183,10 @@ int main(int argc, char **argv) { && !PrintFaultMaps && DwarfDumpType == DIDT_Null) { cl::PrintHelpMessage(); - return 2; - } - - std::for_each(InputFilenames.begin(), InputFilenames.end(), - DumpInput); - - return EXIT_SUCCESS; -} + return 2; + } + + llvm::for_each(InputFilenames, DumpInput); + + return EXIT_SUCCESS; +} diff --git a/tools/llvm-pdbutil/llvm-pdbutil.cpp b/tools/llvm-pdbutil/llvm-pdbutil.cpp index 8b2d5ce179f..bee9f182e3f 100644 --- a/tools/llvm-pdbutil/llvm-pdbutil.cpp +++ b/tools/llvm-pdbutil/llvm-pdbutil.cpp @@ -1199,20 +1199,17 @@ int main(int argc_, const char *argv_[]) { opts::pretty::ExcludeCompilands.push_back( "f:\\\\binaries\\\\Intermediate\\\\vctools\\\\crt_bld"); opts::pretty::ExcludeCompilands.push_back("f:\\\\dd\\\\vctools\\\\crt"); - opts::pretty::ExcludeCompilands.push_back( - "d:\\\\th.obj.x86fre\\\\minkernel"); - } - std::for_each(opts::pretty::InputFilenames.begin(), - opts::pretty::InputFilenames.end(), dumpPretty); - } else if (opts::DumpSubcommand) { - std::for_each(opts::dump::InputFilenames.begin(), - opts::dump::InputFilenames.end(), dumpRaw); - } else if (opts::BytesSubcommand) { - std::for_each(opts::bytes::InputFilenames.begin(), - opts::bytes::InputFilenames.end(), dumpBytes); - } else if (opts::DiffSubcommand) { - for (StringRef S : opts::diff::RawModiEquivalences) { - StringRef Left; + opts::pretty::ExcludeCompilands.push_back( + "d:\\\\th.obj.x86fre\\\\minkernel"); + } + llvm::for_each(opts::pretty::InputFilenames, dumpPretty); + } else if (opts::DumpSubcommand) { + llvm::for_each(opts::dump::InputFilenames, dumpRaw); + } else if (opts::BytesSubcommand) { + llvm::for_each(opts::bytes::InputFilenames, dumpBytes); + } else if (opts::DiffSubcommand) { + for (StringRef S : opts::diff::RawModiEquivalences) { + StringRef Left; StringRef Right; std::tie(Left, Right) = S.split(','); uint32_t X, Y; diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp index 05b7c800cc1..851988110ea 100644 --- a/tools/llvm-readobj/llvm-readobj.cpp +++ b/tools/llvm-readobj/llvm-readobj.cpp @@ -566,14 +566,13 @@ int main(int argc, const char *argv[]) { cl::ParseCommandLineOptions(argc, argv, "LLVM Object Reader\n"); // Default to stdin if no filename is specified. 
-  if (opts::InputFilenames.size() == 0)
-    opts::InputFilenames.push_back("-");
-
-  std::for_each(opts::InputFilenames.begin(), opts::InputFilenames.end(),
-                dumpInput);
-
-  if (opts::CodeViewMergedTypes) {
-    ScopedPrinter W(outs());
+  if (opts::InputFilenames.size() == 0)
+    opts::InputFilenames.push_back("-");
+
+  llvm::for_each(opts::InputFilenames, dumpInput);
+
+  if (opts::CodeViewMergedTypes) {
+    ScopedPrinter W(outs());
     dumpCodeViewMergedTypes(W, CVTypes.IDTable, CVTypes.TypeTable);
   }
 
diff --git a/tools/llvm-size/llvm-size.cpp b/tools/llvm-size/llvm-size.cpp
index bdb118a264e..7a8e744d2e6 100644
--- a/tools/llvm-size/llvm-size.cpp
+++ b/tools/llvm-size/llvm-size.cpp
@@ -880,14 +880,13 @@ int main(int argc, char **argv) {
   }
 
   if (InputFilenames.size() == 0)
-    InputFilenames.push_back("a.out");
-
-  MoreThanOneFile = InputFilenames.size() > 1;
-  std::for_each(InputFilenames.begin(), InputFilenames.end(),
-                printFileSectionSizes);
-  if (OutputFormat == berkeley && TotalSizes)
-    printBerkelyTotals();
-
+    InputFilenames.push_back("a.out");
+
+  MoreThanOneFile = InputFilenames.size() > 1;
+  llvm::for_each(InputFilenames, printFileSectionSizes);
+  if (OutputFormat == berkeley && TotalSizes)
+    printBerkelyTotals();
+
   if (HadError)
     return 1;
 }
diff --git a/unittests/ADT/STLExtrasTest.cpp b/unittests/ADT/STLExtrasTest.cpp
index 2e6eb6f413f..68cd9f5d2c8 100644
--- a/unittests/ADT/STLExtrasTest.cpp
+++ b/unittests/ADT/STLExtrasTest.cpp
@@ -252,12 +252,20 @@ TEST(STLExtrasTest, CountAdaptor) {
   EXPECT_EQ(3, count(v, 1));
   EXPECT_EQ(2, count(v, 2));
   EXPECT_EQ(1, count(v, 3));
-  EXPECT_EQ(1, count(v, 4));
-}
-
-TEST(STLExtrasTest, ToVector) {
-  std::vector<char> v = {'a', 'b', 'c'};
-  auto Enumerated = to_vector<4>(enumerate(v));
+  EXPECT_EQ(1, count(v, 4));
+}
+
+TEST(STLExtrasTest, for_each) {
+  std::vector<int> v{ 0, 1, 2, 3, 4 };
+  int count = 0;
+
+  llvm::for_each(v, [&count](int) { ++count; });
+  EXPECT_EQ(5, count);
+}
+
+TEST(STLExtrasTest, ToVector) {
+  std::vector<char> v = {'a', 'b', 'c'};
+  auto Enumerated = to_vector<4>(enumerate(v));
   ASSERT_EQ(3u, Enumerated.size());
   for (size_t I = 0; I < v.size(); ++I) {
     EXPECT_EQ(I, Enumerated[I].index());
-- cgit v1.2.3

From 2619256bd715b06c947e862f5f53511795dae1a3 Mon Sep 17 00:00:00 2001
From: Aaron Ballman
Date: Fri, 3 Nov 2017 20:05:51 +0000
Subject: Correcting some CRLFs that snuck in with my previous commit; NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317357 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/STLExtras.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index c42d976f467..1be5bf91385 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -813,12 +813,12 @@ void DeleteContainerSeconds(Container &C) {
   C.clear();
 }
 
-/// Provide wrappers to std::for_each which take ranges instead of having to
-/// pass begin/end explicitly.
-template <typename R, typename UnaryPredicate>
-UnaryPredicate for_each(R &&Range, UnaryPredicate P) {
-  return std::for_each(std::begin(Range), std::end(Range), P);
-}
+/// Provide wrappers to std::for_each which take ranges instead of having to
+/// pass begin/end explicitly.
+template <typename R, typename UnaryPredicate>
+UnaryPredicate for_each(R &&Range, UnaryPredicate P) {
+  return std::for_each(std::begin(Range), std::end(Range), P);
+}
 
 /// Provide wrappers to std::all_of which take ranges instead of having to pass
 /// begin/end explicitly.
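For illustration, a minimal sketch of how the range-based wrapper above is
called (the container, lambda, and function name here are hypothetical
examples, not code from these patches):

    #include "llvm/ADT/STLExtras.h"
    #include <vector>

    int sumExample() {
      std::vector<int> V = {1, 2, 3};
      int Sum = 0;
      // Equivalent to std::for_each(V.begin(), V.end(), ...), without
      // spelling out the iterator pair.
      llvm::for_each(V, [&Sum](int X) { Sum += X; });
      return Sum; // 6
    }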
-- cgit v1.2.3

From af481e4f940025c84ce601e68fdedbc1bd22cdd2 Mon Sep 17 00:00:00 2001
From: Martin Storsjo
Date: Fri, 3 Nov 2017 20:09:10 +0000
Subject: [llvm-ar] Support an options string that starts with a dash

Some projects call $AR like "$AR -crs output input1 input2".

Differential Revision: https://reviews.llvm.org/D39538

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317358 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-ar/default-add.test |  3 ++-
 tools/llvm-ar/llvm-ar.cpp           | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/test/tools/llvm-ar/default-add.test b/test/tools/llvm-ar/default-add.test
index 88719e4efce..68e41c24910 100644
--- a/test/tools/llvm-ar/default-add.test
+++ b/test/tools/llvm-ar/default-add.test
@@ -4,7 +4,8 @@ RUN: yaml2obj %S/Inputs/coff.yaml -o %t-coff.o
 RUN: rm -f %t.ar
 RUN: llvm-ar crs %t.ar %t-macho.o
 RUN: grep -q __.SYMDEF %t.ar
-RUN: llvm-ar crs %t.ar %t-coff.o
+Test that an option string prefixed by a dash works.
+RUN: llvm-ar -crs %t.ar %t-coff.o
 RUN: grep -q __.SYMDEF %t.ar
 RUN: rm -f %t.ar
diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp
index 576265cfe59..8c19f6b6af8 100644
--- a/tools/llvm-ar/llvm-ar.cpp
+++ b/tools/llvm-ar/llvm-ar.cpp
@@ -127,6 +127,8 @@ static cl::extrahelp MoreHelp(
   " [v] - be verbose about actions taken\n"
 );
 
+static const char OptionChars[] = "dmpqrtxabiosSTucv";
+
 // This enumeration delineates the kinds of operations on an archive
 // that are permitted.
 enum ArchiveOperation {
@@ -864,6 +866,24 @@ int main(int argc, char **argv) {
       Stem.find("lib") != StringRef::npos)
     return libDriverMain(makeArrayRef(argv, argc));
 
+  for (int i = 1; i < argc; i++) {
+    // If an argument starts with a dash and only contains chars
+    // that belong to the options chars set, remove the dash.
+    // We can't handle it after the command line options parsing
+    // is done, since it will error out on an unrecognized string
+    // starting with a dash.
+    // Make sure this doesn't match the actual llvm-ar specific options
+    // that start with a dash.
+    StringRef S = argv[i];
+    if (S.startswith("-") &&
+        S.find_first_not_of(OptionChars, 1) == StringRef::npos) {
+      argv[i]++;
+      break;
+    }
+    if (S == "--")
+      break;
+  }
+
   // Have the command line options parsed and handle things
   // like --help and --version.
   cl::ParseCommandLineOptions(argc, argv,
-- cgit v1.2.3

From ceb5b1b4346ad8e1b2f693199153a5e68c784077 Mon Sep 17 00:00:00 2001
From: David Blaikie
Date: Fri, 3 Nov 2017 20:24:19 +0000
Subject: Modularize: Include some required headers

DenseMap requires the definition of a type to be available when a pointer
to that type is used as a key, so that it can tell how many low bits are
available for the tombstone/empty sentinel keys.
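For illustration, a minimal sketch of the constraint (the map and variable
below are hypothetical, not code from this patch): DenseMapInfo for pointer
keys derives its empty and tombstone sentinels from the key type's alignment,
so the pointee must be a complete type wherever such a map is instantiated:

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/MC/MCSymbol.h" // a forward declaration alone is not enough

    // DenseMapInfo<MCSymbol *> computes how many low pointer bits are free
    // for the empty/tombstone keys from the alignment of MCSymbol, which
    // requires the complete type definition.
    llvm::DenseMap<llvm::MCSymbol *, unsigned> SymbolIndexes;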
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317360 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/StackMaps.h     | 2 +-
 lib/Bitcode/Writer/ValueEnumerator.h | 2 ++
 lib/CodeGen/AsmPrinter/DwarfFile.h   | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/llvm/CodeGen/StackMaps.h b/include/llvm/CodeGen/StackMaps.h
index 8263946ed92..4407114d274 100644
--- a/include/llvm/CodeGen/StackMaps.h
+++ b/include/llvm/CodeGen/StackMaps.h
@@ -14,6 +14,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/Debug.h"
 #include
 #include
@@ -25,7 +26,6 @@ namespace llvm {
 class AsmPrinter;
 class MCExpr;
 class MCStreamer;
-class MCSymbol;
 class raw_ostream;
 class TargetRegisterInfo;
 
diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h
index 730187087dc..011356c3260 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/lib/Bitcode/Writer/ValueEnumerator.h
@@ -18,6 +18,8 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/UniqueVector.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
 #include "llvm/IR/UseListOrder.h"
 #include
 #include
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index 6e4625ba411..167ca13c19c 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -15,6 +15,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/DIE.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/Support/Allocator.h"
 #include
 #include
@@ -27,7 +28,6 @@ class DwarfCompileUnit;
 class DwarfUnit;
 class LexicalScope;
 class MCSection;
-class MDNode;
 
 class DwarfFile {
   // Target of Dwarf emission, used for sizing of abbreviations.
-- cgit v1.2.3

From f4beb75be0ff7db0d9c80bbb0efddcd20e7b1d59 Mon Sep 17 00:00:00 2001
From: Jun Bum Lim
Date: Fri, 3 Nov 2017 20:41:16 +0000
Subject: Recommit r317351 : Add CallSiteSplitting pass

This recommits r317351 after fixing a buildbot failure.

Original commit message:

Summary:
This change adds a pass which tries to split a call-site to pass
more constrained arguments if its argument is predicated in the control flow
so that we can expose better context to the later passes (e.g., inliner, jump
threading, or IPA-CP based function cloning, etc.).
As of now we support two cases:

1) If a call site is dominated by an OR condition and if any of its arguments
are predicated on this OR condition, try to split the condition with more
constrained arguments. For example, in the code below, we try to split the
call site since we can predicate the argument (ptr) based on the OR condition.
Split from : if (!ptr || c) callee(ptr); to : if (!ptr) callee(null ptr) // set the known constant value else if (c) callee(nonnull ptr) // set non-null attribute in the argument 2) We can also split a call-site based on constant incoming values of a PHI For example, from : BB0: %c = icmp eq i32 %i1, %i2 br i1 %c, label %BB2, label %BB1 BB1: br label %BB2 BB2: %p = phi i32 [ 0, %BB0 ], [ 1, %BB1 ] call void @bar(i32 %p) to BB0: %c = icmp eq i32 %i1, %i2 br i1 %c, label %BB2-split0, label %BB1 BB1: br label %BB2-split1 BB2-split0: call void @bar(i32 0) br label %BB2 BB2-split1: call void @bar(i32 1) br label %BB2 BB2: %p = phi i32 [ 0, %BB2-split0 ], [ 1, %BB2-split1 ] git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317362 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/InitializePasses.h | 1 + include/llvm/Transforms/Scalar.h | 8 + include/llvm/Transforms/Scalar/CallSiteSplitting.h | 29 ++ lib/Passes/PassBuilder.cpp | 9 +- lib/Passes/PassRegistry.def | 1 + lib/Transforms/IPO/PassManagerBuilder.cpp | 6 + lib/Transforms/Scalar/CMakeLists.txt | 1 + lib/Transforms/Scalar/CallSiteSplitting.cpp | 493 +++++++++++++++++++++ lib/Transforms/Scalar/Scalar.cpp | 1 + test/Other/new-pm-defaults.ll | 1 + test/Other/new-pm-lto-defaults.ll | 9 +- test/Other/new-pm-thinlto-defaults.ll | 1 + .../CallSiteSplitting/callsite-split-or-phi.ll | 339 ++++++++++++++ .../Transforms/CallSiteSplitting/callsite-split.ll | 119 +++++ 14 files changed, 1015 insertions(+), 3 deletions(-) create mode 100644 include/llvm/Transforms/Scalar/CallSiteSplitting.h create mode 100644 lib/Transforms/Scalar/CallSiteSplitting.cpp create mode 100644 test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll create mode 100644 test/Transforms/CallSiteSplitting/callsite-split.ll diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index b8183d1c8e2..9cdb49330ae 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -80,6 +80,7 @@ void initializeBranchFolderPassPass(PassRegistry&); void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry&); void initializeBranchRelaxationPass(PassRegistry&); void initializeBreakCriticalEdgesPass(PassRegistry&); +void initializeCallSiteSplittingLegacyPassPass(PassRegistry&); void initializeCFGOnlyPrinterLegacyPassPass(PassRegistry&); void initializeCFGOnlyViewerLegacyPassPass(PassRegistry&); void initializeCFGPrinterLegacyPassPass(PassRegistry&); diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index a78c897683f..0cf1115dc97 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -73,6 +73,14 @@ FunctionPass *createDeadCodeEliminationPass(); // FunctionPass *createDeadStoreEliminationPass(); + +//===----------------------------------------------------------------------===// +// +// CallSiteSplitting - This pass split call-site based on its known argument +// values. +FunctionPass *createCallSiteSplittingPass(); + + //===----------------------------------------------------------------------===// // // AggressiveDCE - This pass uses the SSA based Aggressive DCE algorithm. 
This diff --git a/include/llvm/Transforms/Scalar/CallSiteSplitting.h b/include/llvm/Transforms/Scalar/CallSiteSplitting.h new file mode 100644 index 00000000000..5ab951a49f2 --- /dev/null +++ b/include/llvm/Transforms/Scalar/CallSiteSplitting.h @@ -0,0 +1,29 @@ +//===- CallSiteSplitting..h - Callsite Splitting ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H +#define LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H + +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/Compiler.h" +#include + +namespace llvm { + +struct CallSiteSplittingPass : PassInfoMixin { + /// \brief Run the pass over the function. + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index 21d95a07125..2088ea0cea2 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -89,6 +89,7 @@ #include "llvm/Transforms/Scalar/ADCE.h" #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" #include "llvm/Transforms/Scalar/BDCE.h" +#include "llvm/Transforms/Scalar/CallSiteSplitting.h" #include "llvm/Transforms/Scalar/ConstantHoisting.h" #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" #include "llvm/Transforms/Scalar/DCE.h" @@ -548,6 +549,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, EarlyFPM.addPass(SROA()); EarlyFPM.addPass(EarlyCSEPass()); EarlyFPM.addPass(LowerExpectIntrinsicPass()); + if (Level == O3) + EarlyFPM.addPass(CallSiteSplittingPass()); + // In SamplePGO ThinLTO backend, we need instcombine before profile annotation // to convert bitcast to direct calls so that they can be inlined during the // profile annotation prepration step. @@ -920,13 +924,16 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MPM.addPass(InferFunctionAttrsPass()); if (Level > 1) { + FunctionPassManager EarlyFPM(DebugLogging); + EarlyFPM.addPass(CallSiteSplittingPass()); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM))); + // Indirect call promotion. This should promote all the targets that are // left by the earlier promotion pass that promotes intra-module targets. // This two-step promotion is to save the compile time. For LTO, it should // produce the same result as if we only do promotion here. MPM.addPass(PGOIndirectCallPromotion( true /* InLTO */, PGOOpt && !PGOOpt->SampleProfileFile.empty())); - // Propagate constants at call sites into the functions they call. This // opens opportunities for globalopt (and inlining) by substituting function // pointers passed as arguments to direct uses of functions. 
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def index 20d1220ac33..40b884351fd 100644 --- a/lib/Passes/PassRegistry.def +++ b/lib/Passes/PassRegistry.def @@ -140,6 +140,7 @@ FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass()) FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass()) FUNCTION_PASS("bdce", BDCEPass()) FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass()) +FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass()) FUNCTION_PASS("consthoist", ConstantHoistingPass()) FUNCTION_PASS("correlated-propagation", CorrelatedValuePropagationPass()) FUNCTION_PASS("dce", DCEPass()) diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 828eb5eee29..b8ff614f7c8 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -467,6 +467,9 @@ void PassManagerBuilder::populateModulePassManager( addExtensionsToPM(EP_ModuleOptimizerEarly, MPM); + if (OptLevel > 2) + MPM.add(createCallSiteSplittingPass()); + MPM.add(createIPSCCPPass()); // IP SCCP MPM.add(createCalledValuePropagationPass()); MPM.add(createGlobalOptimizerPass()); // Optimize out global vars @@ -703,6 +706,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createInferFunctionAttrsLegacyPass()); if (OptLevel > 1) { + // Split call-site with more constrained arguments. + PM.add(createCallSiteSplittingPass()); + // Indirect call promotion. This should promote all the targets that are // left by the earlier promotion pass that promotes intra-module targets. // This two-step promotion is to save the compile time. For LTO, it should diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index d79ae851005..6a27fbca8b7 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -2,6 +2,7 @@ add_llvm_library(LLVMScalarOpts ADCE.cpp AlignmentFromAssumptions.cpp BDCE.cpp + CallSiteSplitting.cpp ConstantHoisting.cpp ConstantProp.cpp CorrelatedValuePropagation.cpp diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp new file mode 100644 index 00000000000..2224cb2eb62 --- /dev/null +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -0,0 +1,493 @@ +//===- CallSiteSplitting.cpp ----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a transformation that tries to split a call-site to pass +// more constrained arguments if its argument is predicated in the control flow +// so that we can expose better context to the later passes (e.g, inliner, jump +// threading, or IPA-CP based function cloning, etc.). +// As of now we support two cases : +// +// 1) If a call site is dominated by an OR condition and if any of its arguments +// are predicated on this OR condition, try to split the condition with more +// constrained arguments. For example, in the code below, we try to split the +// call site since we can predicate the argument(ptr) based on the OR condition. 
+//
+// Split from :
+//   if (!ptr || c)
+//     callee(ptr);
+// to :
+//   if (!ptr)
+//     callee(null)         // set the known constant value
+//   else if (c)
+//     callee(nonnull ptr)  // set non-null attribute in the argument
+//
+// 2) We can also split a call-site based on constant incoming values of a PHI
+// For example,
+// from :
+//   Header:
+//    %c = icmp eq i32 %i1, %i2
+//    br i1 %c, label %Tail, label %TBB
+//   TBB:
+//    br label %Tail
+//   Tail:
+//    %p = phi i32 [ 0, %Header], [ 1, %TBB]
+//    call void @bar(i32 %p)
+// to
+//   Header:
+//    %c = icmp eq i32 %i1, %i2
+//    br i1 %c, label %Tail-split0, label %TBB
+//   TBB:
+//    br label %Tail-split1
+//   Tail-split0:
+//    call void @bar(i32 0)
+//    br label %Tail
+//   Tail-split1:
+//    call void @bar(i32 1)
+//    br label %Tail
+//   Tail:
+//    %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ]
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "callsite-splitting"
+
+STATISTIC(NumCallSiteSplit, "Number of call-sites split");
+
+static void addNonNullAttribute(Instruction *CallI, Instruction *&NewCallI,
+                                Value *Op) {
+  if (!NewCallI)
+    NewCallI = CallI->clone();
+  CallSite CS(NewCallI);
+  unsigned ArgNo = 0;
+  for (auto &I : CS.args()) {
+    if (&*I == Op)
+      CS.addParamAttr(ArgNo, Attribute::NonNull);
+    ++ArgNo;
+  }
+}
+
+static void setConstantInArgument(Instruction *CallI, Instruction *&NewCallI,
+                                  Value *Op, Constant *ConstValue) {
+  if (!NewCallI)
+    NewCallI = CallI->clone();
+  CallSite CS(NewCallI);
+  unsigned ArgNo = 0;
+  for (auto &I : CS.args()) {
+    if (&*I == Op)
+      CS.setArgument(ArgNo, ConstValue);
+    ++ArgNo;
+  }
+}
+
+static bool createCallSitesOnOrPredicatedArgument(
+    CallSite CS, Instruction *&NewCSTakenFromHeader,
+    Instruction *&NewCSTakenFromNextCond,
+    SmallVectorImpl<BranchInst *> &BranchInsts, BasicBlock *HeaderBB) {
+  assert(BranchInsts.size() <= 2 &&
+         "Unexpected number of blocks in the OR predicated condition");
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *CallSiteBB = Instr->getParent();
+  TerminatorInst *HeaderTI = HeaderBB->getTerminator();
+  bool IsCSInTakenPath = CallSiteBB == HeaderTI->getSuccessor(0);
+
+  for (unsigned I = 0, E = BranchInsts.size(); I != E; ++I) {
+    BranchInst *PBI = BranchInsts[I];
+    assert(isa<ICmpInst>(PBI->getCondition()) &&
+           "Unexpected condition in a conditional branch.");
+    ICmpInst *Cmp = cast<ICmpInst>(PBI->getCondition());
+    Value *Arg = Cmp->getOperand(0);
+    assert(isa<Constant>(Cmp->getOperand(1)) &&
+           "Expected op1 to be a constant.");
+    Constant *ConstVal = cast<Constant>(Cmp->getOperand(1));
+    CmpInst::Predicate Pred = Cmp->getPredicate();
+
+    if (PBI->getParent() == HeaderBB) {
+      Instruction *&CallTakenFromHeader =
+          IsCSInTakenPath ? NewCSTakenFromHeader : NewCSTakenFromNextCond;
+      Instruction *&CallUntakenFromHeader =
+          IsCSInTakenPath ? NewCSTakenFromNextCond : NewCSTakenFromHeader;
+
+      assert((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
+             "Unexpected predicate in an OR condition");
+
+      // Set the constant value for arguments in the call predicated based on
+      // the OR condition.
+      Instruction *&CallToSetConst = Pred == ICmpInst::ICMP_EQ
+                                         ? CallTakenFromHeader
+                                         : CallUntakenFromHeader;
+      setConstantInArgument(Instr, CallToSetConst, Arg, ConstVal);
+
+      // Add the NonNull attribute if compared with the null pointer.
+      if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
+        Instruction *&CallToSetAttr = Pred == ICmpInst::ICMP_EQ
+                                          ? CallUntakenFromHeader
+                                          : CallTakenFromHeader;
+        addNonNullAttribute(Instr, CallToSetAttr, Arg);
+      }
+      continue;
+    }
+
+    if (Pred == ICmpInst::ICMP_EQ) {
+      if (PBI->getSuccessor(0) == Instr->getParent()) {
+        // Set the constant value for the call taken from the second block in
+        // the OR condition.
+        setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal);
+      } else {
+        // Add the NonNull attribute if compared with the null pointer for the
+        // call taken from the second block in the OR condition.
+        if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue())
+          addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg);
+      }
+    } else {
+      if (PBI->getSuccessor(0) == Instr->getParent()) {
+        // Add the NonNull attribute if compared with the null pointer for the
+        // call taken from the second block in the OR condition.
+        if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue())
+          addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg);
+      } else if (Pred == ICmpInst::ICMP_NE) {
+        // Set the constant value for the call in the untaken path from the
+        // header block.
+        setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal);
+      } else
+        llvm_unreachable("Unexpected condition");
+    }
+  }
+  return NewCSTakenFromHeader || NewCSTakenFromNextCond;
+}
+
+static bool canSplitCallSite(CallSite CS) {
+  // FIXME: As of now we handle only CallInst. InvokeInst could be handled
+  // without too much effort.
+  Instruction *Instr = CS.getInstruction();
+  if (!isa<CallInst>(Instr))
+    return false;
+
+  // Allow splitting a call-site only when there is no instruction before the
+  // call-site in the basic block. Based on this constraint, we only clone the
+  // call instruction, and we do not move a call-site across any other
+  // instruction.
+  BasicBlock *CallSiteBB = Instr->getParent();
+  if (Instr != CallSiteBB->getFirstNonPHI())
+    return false;
+
+  pred_iterator PII = pred_begin(CallSiteBB);
+  pred_iterator PIE = pred_end(CallSiteBB);
+  unsigned NumPreds = std::distance(PII, PIE);
+
+  // Allow only one extra call-site. No more than two from one call-site.
+  if (NumPreds != 2)
+    return false;
+
+  // Cannot split an edge from an IndirectBrInst.
+  BasicBlock *Preds[2] = {*PII++, *PII};
+  if (isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
+      isa<IndirectBrInst>(Preds[1]->getTerminator()))
+    return false;
+
+  return CallSiteBB->canSplitPredecessors();
+}
+
+/// Return true if the CS is split into its new predecessors which are directly
+/// hooked to each of its original predecessors pointed by PredBB1 and PredBB2.
+/// Note that PredBB1 and PredBB2 are decided in findPredicatedArgument(),
+/// especially for the OR predicated case where PredBB1 will point to the
+/// header, and PredBB2 will point to the second compare block. CallInst1 and
+/// CallInst2 will be the new call-sites placed in the new predecessors split
+/// for PredBB1 and PredBB2, respectively. Therefore, CallInst1 will be the
+/// call-site placed between Header and Tail, and CallInst2 will be the
+/// call-site between TBB and Tail.
+/// For example, in the IR below with an OR condition, the call-site can
+/// be split
+///
+/// from :
+///
+///   Header:
+///     %c = icmp eq i32* %a, null
+///     br i1 %c %Tail, %TBB
+///   TBB:
+///     %c2 = icmp eq i32* %b, null
+///     br i1 %c %Tail, %End
+///   Tail:
+///     %ca = call i1 @callee (i32* %a, i32* %b)
+///
+/// to :
+///
+///   Header:                          // PredBB1 is Header
+///     %c = icmp eq i32* %a, null
+///     br i1 %c %Tail-split1, %TBB
+///   TBB:                             // PredBB2 is TBB
+///     %c2 = icmp eq i32* %b, null
+///     br i1 %c %Tail-split2, %End
+///   Tail-split1:
+///     %ca1 = call @callee (i32* null, i32* %b)          // CallInst1
+///     br %Tail
+///   Tail-split2:
+///     %ca2 = call @callee (i32* nonnull %a, i32* null)  // CallInst2
+///     br %Tail
+///   Tail:
+///     %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]
+///
+/// Note that for an OR predicated case, CallInst1 and CallInst2 should be
+/// created with more constrained arguments in
+/// createCallSitesOnOrPredicatedArgument().
+static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
+                          Instruction *CallInst1, Instruction *CallInst2) {
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *TailBB = Instr->getParent();
+  assert(Instr == (TailBB->getFirstNonPHI()) && "Unexpected call-site");
+
+  BasicBlock *SplitBlock1 =
+      SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split");
+  BasicBlock *SplitBlock2 =
+      SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split");
+
+  assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split.");
+
+  if (!CallInst1)
+    CallInst1 = Instr->clone();
+  if (!CallInst2)
+    CallInst2 = Instr->clone();
+
+  CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt());
+  CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt());
+
+  CallSite CS1(CallInst1);
+  CallSite CS2(CallInst2);
+
+  // Handle PHIs used as arguments in the call-site.
+  for (auto &PI : *TailBB) {
+    PHINode *PN = dyn_cast<PHINode>(&PI);
+    if (!PN)
+      break;
+    unsigned ArgNo = 0;
+    for (auto &CI : CS.args()) {
+      if (&*CI == PN) {
+        CS1.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock1));
+        CS2.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock2));
+      }
+      ++ArgNo;
+    }
+  }
+
+  // Replace users of the original call with a PHI merging the split call-sites.
+  if (Instr->getNumUses()) {
+    PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call", Instr);
+    PN->addIncoming(CallInst1, SplitBlock1);
+    PN->addIncoming(CallInst2, SplitBlock2);
+    Instr->replaceAllUsesWith(PN);
+  }
+  DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
+  DEBUG(dbgs() << "    " << *CallInst1 << " in " << SplitBlock1->getName()
+               << "\n");
+  DEBUG(dbgs() << "    " << *CallInst2 << " in " << SplitBlock2->getName()
+               << "\n");
+  Instr->eraseFromParent();
+  NumCallSiteSplit++;
+}
+
+static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) {
+  assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand.");
+  Value *Op0 = Cmp->getOperand(0);
+  unsigned ArgNo = 0;
+  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
+       ++I, ++ArgNo) {
+    // Don't consider constant or arguments that are already known non-null.
+    if (isa<Constant>(*I) || CS.paramHasAttr(ArgNo, Attribute::NonNull))
+      continue;
+
+    if (*I == Op0)
+      return true;
+  }
+  return false;
+}
+
+static void findOrCondRelevantToCallArgument(
+    CallSite CS, BasicBlock *PredBB, BasicBlock *OtherPredBB,
+    SmallVectorImpl<BranchInst *> &BranchInsts, BasicBlock *&HeaderBB) {
+  auto *PBI = dyn_cast<BranchInst>(PredBB->getTerminator());
+  if (!PBI || !PBI->isConditional())
+    return;
+
+  if (PBI->getSuccessor(0) == OtherPredBB ||
+      PBI->getSuccessor(1) == OtherPredBB)
+    if (PredBB == OtherPredBB->getSinglePredecessor()) {
+      assert(!HeaderBB && "Expect to find only a single header block");
+      HeaderBB = PredBB;
+    }
+
+  CmpInst::Predicate Pred;
+  Value *Cond = PBI->getCondition();
+  if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant())))
+    return;
+  ICmpInst *Cmp = cast<ICmpInst>(Cond);
+  if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)
+    if (isCondRelevantToAnyCallArgument(Cmp, CS))
+      BranchInsts.push_back(PBI);
+}
+
+// Return true if the call-site has an argument which is a PHI with only
+// constant incoming values.
+static bool isPredicatedOnPHI(CallSite CS) {
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *Parent = Instr->getParent();
+  if (Instr != Parent->getFirstNonPHI())
+    return false;
+
+  for (auto &BI : *Parent) {
+    if (PHINode *PN = dyn_cast<PHINode>(&BI)) {
+      for (auto &I : CS.args())
+        if (&*I == PN) {
+          assert(PN->getNumIncomingValues() == 2 &&
+                 "Unexpected number of incoming values");
+          if (PN->getIncomingBlock(0) == PN->getIncomingBlock(1))
+            return false;
+          if (PN->getIncomingValue(0) == PN->getIncomingValue(1))
+            continue;
+          if (isa<Constant>(PN->getIncomingValue(0)) &&
+              isa<Constant>(PN->getIncomingValue(1)))
+            return true;
+        }
+    }
+    break;
+  }
+  return false;
+}
+
+// Return true if an argument in CS is predicated on an 'or' condition.
+// Create new call-site with arguments constrained based on the OR condition.
+static bool findPredicatedOnOrCondition(CallSite CS, BasicBlock *PredBB1,
+                                        BasicBlock *PredBB2,
+                                        Instruction *&NewCallTakenFromHeader,
+                                        Instruction *&NewCallTakenFromNextCond,
+                                        BasicBlock *&HeaderBB) {
+  SmallVector<BranchInst *, 2> BranchInsts;
+  findOrCondRelevantToCallArgument(CS, PredBB1, PredBB2, BranchInsts, HeaderBB);
+  findOrCondRelevantToCallArgument(CS, PredBB2, PredBB1, BranchInsts, HeaderBB);
+  if (BranchInsts.empty() || !HeaderBB)
+    return false;
+
+  // If an OR condition is detected, try to create call sites with constrained
+  // arguments (e.g., NonNull attribute or constant value).
+  return createCallSitesOnOrPredicatedArgument(CS, NewCallTakenFromHeader,
+                                               NewCallTakenFromNextCond,
+                                               BranchInsts, HeaderBB);
+}
+
+static bool findPredicatedArgument(CallSite CS, Instruction *&CallInst1,
+                                   Instruction *&CallInst2,
+                                   BasicBlock *&PredBB1, BasicBlock *&PredBB2) {
+  BasicBlock *CallSiteBB = CS.getInstruction()->getParent();
+  pred_iterator PII = pred_begin(CallSiteBB);
+  pred_iterator PIE = pred_end(CallSiteBB);
+  assert(std::distance(PII, PIE) == 2 && "Expect only two predecessors.");
+  (void)PIE;
+  BasicBlock *Preds[2] = {*PII++, *PII};
+  BasicBlock *&HeaderBB = PredBB1;
+  if (!findPredicatedOnOrCondition(CS, Preds[0], Preds[1], CallInst1, CallInst2,
+                                   HeaderBB) &&
+      !isPredicatedOnPHI(CS))
+    return false;
+
+  if (!PredBB1)
+    PredBB1 = Preds[0];
+
+  PredBB2 = PredBB1 == Preds[0] ? Preds[1] : Preds[0];
+  return true;
+}
+
+static bool tryToSplitCallSite(CallSite CS) {
+  if (!CS.arg_size())
+    return false;
+
+  BasicBlock *PredBB1 = nullptr;
+  BasicBlock *PredBB2 = nullptr;
+  Instruction *CallInst1 = nullptr;
+  Instruction *CallInst2 = nullptr;
+  if (!canSplitCallSite(CS) ||
+      !findPredicatedArgument(CS, CallInst1, CallInst2, PredBB1, PredBB2)) {
+    assert(!CallInst1 && !CallInst2 && "Unexpected new call-sites cloned.");
+    return false;
+  }
+  splitCallSite(CS, PredBB1, PredBB2, CallInst1, CallInst2);
+  return true;
+}
+
+static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
+  bool Changed = false;
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
+    BasicBlock &BB = *BI++;
+    for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
+      Instruction *I = &*II++;
+      CallSite CS(cast<Value>(I));
+      if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI))
+        continue;
+
+      Function *Callee = CS.getCalledFunction();
+      if (!Callee || Callee->isDeclaration())
+        continue;
+      Changed |= tryToSplitCallSite(CS);
+    }
+  }
+  return Changed;
+}
+
+namespace {
+struct CallSiteSplittingLegacyPass : public FunctionPass {
+  static char ID;
+  CallSiteSplittingLegacyPass() : FunctionPass(ID) {
+    initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    return doCallSiteSplitting(F, TLI);
+  }
+};
+} // namespace
+
+char CallSiteSplittingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
+                      "Call-site splitting", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
+                    "Call-site splitting", false, false)
+FunctionPass *llvm::createCallSiteSplittingPass() {
+  return new CallSiteSplittingLegacyPass();
+}
+
+PreservedAnalyses CallSiteSplittingPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+  if (!doCallSiteSplitting(F, TLI))
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  return PA;
+}
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index c1034ace206..8a5ae1b8731 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -35,6 +35,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeADCELegacyPassPass(Registry);
   initializeBDCELegacyPassPass(Registry);
   initializeAlignmentFromAssumptionsPass(Registry);
+  initializeCallSiteSplittingLegacyPassPass(Registry);
   initializeConstantHoistingLegacyPassPass(Registry);
   initializeConstantPropagationPass(Registry);
   initializeCorrelatedValuePropagationPass(Registry);
diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll
index 816f75310e3..0810a13c141 100644
--- a/test/Other/new-pm-defaults.ll
+++ b/test/Other/new-pm-defaults.ll
@@ -76,6 +76,7 @@
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
 ; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass
+; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass
 ; CHECK-O-NEXT: Finished llvm::Function pass manager run.
; CHECK-O-NEXT: Running pass: IPSCCPPass ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass diff --git a/test/Other/new-pm-lto-defaults.ll b/test/Other/new-pm-lto-defaults.ll index fc52f70ff4c..878198d1447 100644 --- a/test/Other/new-pm-lto-defaults.ll +++ b/test/Other/new-pm-lto-defaults.ll @@ -29,9 +29,14 @@ ; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass ; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis +; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> +; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Module +; CHECK-O2-NEXT: Starting llvm::Function pass manager run. +; CHECK-O2-NEXT: Running pass: CallSiteSplittingPass on foo +; CHECK-O2-NEXT: Running analysis: TargetLibraryAnalysis on foo +; CHECK-O2-NEXT: Finished llvm::Function pass manager run. ; CHECK-O2-NEXT: PGOIndirectCallPromotion ; CHECK-O2-NEXT: Running analysis: ProfileSummaryAnalysis -; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Function ; CHECK-O2-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis ; CHECK-O2-NEXT: Running pass: IPSCCPPass ; CHECK-O2-NEXT: Running pass: CalledValuePropagationPass @@ -42,7 +47,7 @@ ; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}> ; CHECK-O-NEXT: Running analysis: AAManager -; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis +; CHECK-O1-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass ; CHECK-O-NEXT: Running analysis: CallGraphAnalysis ; CHECK-O-NEXT: Running pass: GlobalSplitPass diff --git a/test/Other/new-pm-thinlto-defaults.ll b/test/Other/new-pm-thinlto-defaults.ll index 7d40ef3eea2..e83f0f87055 100644 --- a/test/Other/new-pm-thinlto-defaults.ll +++ b/test/Other/new-pm-thinlto-defaults.ll @@ -72,6 +72,7 @@ ; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass +; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. 
; CHECK-O-NEXT: Running pass: IPSCCPPass ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass diff --git a/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll new file mode 100644 index 00000000000..d1d854d8f45 --- /dev/null +++ b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll @@ -0,0 +1,339 @@ +; RUN: opt < %s -callsite-splitting -S | FileCheck %s +; RUN: opt < %s -passes='function(callsite-splitting)' -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linaro-linux-gnueabi" + +;CHECK-LABEL: @test_eq_eq +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_eq_eq(i32* %a, i32 %v) { +Header: + %tobool1 = icmp eq i32* %a, null + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_ne_eq +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_ne_eq(i32* %a, i32 %v) { +Header: + %tobool1 = icmp ne i32* %a, null + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_ne_ne +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_ne_ne(i32* %a, i32 %v) { +Header: + %tobool1 = icmp ne i32* %a, null + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp ne i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_eq_eq_untaken +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 
@test_eq_eq_untaken(i32* %a, i32 %v) { +Header: + %tobool1 = icmp eq i32* %a, null + br i1 %tobool1, label %TBB, label %Tail + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_ne_eq_untaken +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_ne_eq_untaken(i32* %a, i32 %v) { +Header: + %tobool1 = icmp ne i32* %a, null + br i1 %tobool1, label %TBB, label %Tail + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_ne_ne_untaken +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_ne_ne_untaken(i32* %a, i32 %v) { +Header: + %tobool1 = icmp ne i32* %a, null + br i1 %tobool1, label %TBB, label %Tail + +TBB: + %cmp = icmp ne i32 %v, 1 + br i1 %cmp, label %End, label %Tail + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_nonconst_const_phi +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_nonconst_const_phi(i32* %a, i32* %b, i32 %v) { +Header: + %tobool1 = icmp eq i32* %a, %b + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_nonconst_nonconst_phi +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 %v, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_nonconst_nonconst_phi(i32* %a, i32* %b, i32 %v, i32 %v2) { +Header: + %tobool1 = icmp eq i32* %a, %b + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, %v2 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi 
i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_nonconst_nonconst_phi_noncost +;CHECK-NOT: Tail.predBB1.split: +;CHECK-NOT: Tail.predBB2.split: +;CHECK-LABEL: Tail: +;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) +;CHECK: ret i32 %r +define i32 @test_nonconst_nonconst_phi_noncost(i32* %a, i32* %b, i32 %v, i32 %v2) { +Header: + %tobool1 = icmp eq i32* %a, %b + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, %v2 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[%v,%Header], [%v2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_fisrtnonphi +;CHECK-NOT: Tail.predBB1.split: +;CHECK-NOT: Tail.predBB2.split: +;CHECK-LABEL: Tail: +;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) +;CHECK: ret i32 %r +define i32 @test_fisrtnonphi(i32* %a, i32 %v) { +Header: + %tobool1 = icmp eq i32* %a, null + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + store i32 %v, i32* %a + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_3preds_constphi +;CHECK-NOT: Tail.predBB1.split: +;CHECK-NOT: Tail.predBB2.split: +;CHECK-LABEL: Tail: +;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) +;CHECK: ret i32 %r +define i32 @test_3preds_constphi(i32* %a, i32 %v, i1 %c1, i1 %c2, i1 %c3) { +Header: + br i1 %c1, label %Tail, label %TBB1 + +TBB1: + br i1 %c2, label %Tail, label %TBB2 + +TBB2: + br i1 %c3, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB1], [3, %TBB2] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_indirectbr_phi +;CHECK-NOT: Tail.predBB1.split: +;CHECK-NOT: Tail.predBB2.split: +;CHECK-LABEL: Tail: +;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) +;CHECK: ret i32 %r +define i32 @test_indirectbr_phi(i8* %address, i32* %a, i32* %b, i32 %v) { +Header: + %indirect.goto.dest = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address + indirectbr i8* %indirect.goto.dest, [label %TBB, label %Tail] + +TBB: + %indirect.goto.dest2 = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address + indirectbr i8* %indirect.goto.dest2, [label %Tail, label %End] + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +define i32 @callee(i32* %a, i32 %v, i32 %p) { +entry: + %c = icmp ne i32* %a, null + br i1 %c, label %BB1, label %BB2 + +BB1: + call void @dummy(i32* %a, i32 %p) + br label %End + +BB2: + call void @dummy2(i32 %v, i32 %p) + br label %End + +End: + ret i32 %p +} + +declare void @dummy(i32*, i32) +declare void @dummy2(i32, i32) diff --git a/test/Transforms/CallSiteSplitting/callsite-split.ll b/test/Transforms/CallSiteSplitting/callsite-split.ll new file mode 100644 index 00000000000..419fa738563 --- /dev/null +++ b/test/Transforms/CallSiteSplitting/callsite-split.ll @@ -0,0 +1,119 @@ +; RUN: opt < %s -callsite-splitting -inline -instcombine -jump-threading -S | FileCheck %s +; RUN: opt < %s -passes='function(callsite-splitting),cgscc(inline),function(instcombine,jump-threading)' -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linaro-linux-gnueabi" + +%struct.bitmap = type { i32, 
%struct.bitmap* }
+
+;CHECK-LABEL: @caller
+;CHECK-LABEL: NextCond:
+;CHECK: br {{.*}} label %callee.exit
+;CHECK-LABEL: CallSiteBB.predBB1.split:
+;CHECK: call void @callee(%struct.bitmap* null, %struct.bitmap* null, %struct.bitmap* %b_elt, i1 false)
+;CHECK-LABEL: callee.exit:
+;CHECK: call void @dummy2(%struct.bitmap* %a_elt)
+
+define void @caller(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt) {
+entry:
+  br label %Top
+
+Top:
+  %tobool1 = icmp eq %struct.bitmap* %a_elt, null
+  br i1 %tobool1, label %CallSiteBB, label %NextCond
+
+NextCond:
+  %cmp = icmp ne %struct.bitmap* %b_elt, null
+  br i1 %cmp, label %CallSiteBB, label %End
+
+CallSiteBB:
+  %p = phi i1 [0, %Top], [%c, %NextCond]
+  call void @callee(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %p)
+  br label %End
+
+End:
+  ret void
+}
+
+define void @callee(%struct.bitmap* %dst_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %c) {
+entry:
+  %tobool = icmp ne %struct.bitmap* %a_elt, null
+  %tobool1 = icmp ne %struct.bitmap* %b_elt, null
+  %or.cond = and i1 %tobool, %tobool1
+  br i1 %or.cond, label %Cond, label %Big
+
+Cond:
+  %cmp = icmp eq %struct.bitmap* %dst_elt, %a_elt
+  br i1 %cmp, label %Small, label %Big
+
+Small:
+  call void @dummy2(%struct.bitmap* %a_elt)
+  br label %End
+
+Big:
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  br label %End
+
+End:
+  ret void
+}
+
+declare void @dummy2(%struct.bitmap*)
+declare void @dummy1(%struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*)
+
+
+;CHECK-LABEL: @caller2
+;CHECK-LABEL: CallSiteBB.predBB1.split:
+;CHECK: call void @dummy4()
+;CHECK-LABEL: CallSiteBB.predBB2.split:
+;CHECK: call void @dummy3()
+;CHECK-LABEL: CallSiteBB:
+;CHECK: %phi.call = phi i1 [ false, %CallSiteBB.predBB1.split ], [ true, %CallSiteBB.predBB2.split ]
+;CHECK: call void @foo(i1 %phi.call)
+define void @caller2(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, %struct.bitmap* %c_elt) {
+entry:
+  br label %Top
+
+Top:
+  %tobool1 = icmp eq %struct.bitmap* %a_elt, %b_elt
+  br i1 %tobool1, label %CallSiteBB, label %NextCond
+
+NextCond:
+  %cmp = icmp ne %struct.bitmap* %b_elt, %c_elt
+  br i1 %cmp, label %CallSiteBB, label %End
+
+CallSiteBB:
+  %phi = phi i1 [0, %Top],[1, %NextCond]
+  %u = call i1 @callee2(i1 %phi)
+  call void @foo(i1 %u)
+  br label %End
+
+End:
+  ret void
+}
+
+define i1 @callee2(i1 %b) {
+entry:
+  br i1 %b, label
%BB1, label %BB2
+
+BB1:
+  call void @dummy3()
+  br label %End
+
+BB2:
+  call void @dummy4()
+  br label %End
+
+End:
+  ret i1 %b
+}
+
+declare void @dummy3()
+declare void @dummy4()
+declare void @foo(i1)
--
cgit v1.2.3


From 79eed6909a1765163e2abb428a85b670fa3fb454 Mon Sep 17 00:00:00 2001
From: Mitch Phillips
Date: Fri, 3 Nov 2017 20:54:26 +0000
Subject: [cfi-verify] Add blacklist parsing for result filtering.

Adds blacklist parsing behaviour for filtering results into four categories:
 - Expected Protected: Things that are not in the blacklist and are protected.
 - Unexpected Protected: Things that are in the blacklist and are protected.
 - Expected Unprotected: Things that are in the blacklist and are unprotected.
 - Unexpected Unprotected: Things that are not in the blacklist and are
   unprotected.

llvm-cfi-verify can now optionally be invoked with a second command line
argument, which specifies the blacklist file that the binary was built with.

Current statistics for chromium:

Reviewers: vlad.tsyrklevich

Subscribers: mgorny, llvm-commits, pcc, kcc

Differential Revision: https://reviews.llvm.org/D39525

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317364 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../X86/Inputs/protected-lineinfo.s                | 195 +++++++++++
 .../X86/Inputs/unprotected-fullinfo.s              | 380 +++++++++++++++++++++
 .../X86/Inputs/unprotected-lineinfo.s              | 159 +++++++++
 .../X86/Inputs/unprotected-nolineinfo.s            |  87 +++++
 .../X86/blacklist-expected-unprotected.s           |  17 +
 .../llvm-cfi-verify/X86/blacklist-match-fun.s      |  17 +
 .../X86/blacklist-unexpected-protected.s           |  17 +
 .../llvm-cfi-verify/X86/indirect-cf-elimination.s  |   5 +-
 .../tools/llvm-cfi-verify/X86/protected-lineinfo.s | 204 +----------
 .../llvm-cfi-verify/X86/unprotected-lineinfo.s     | 168 +--------
 .../llvm-cfi-verify/X86/unprotected-nolineinfo.s   |  91 +----
 tools/llvm-cfi-verify/CMakeLists.txt               |   2 +-
 tools/llvm-cfi-verify/LLVMBuild.txt                |   2 +-
 tools/llvm-cfi-verify/lib/CMakeLists.txt           |   3 +-
 tools/llvm-cfi-verify/lib/FileAnalysis.cpp         |  49 +--
 tools/llvm-cfi-verify/lib/FileAnalysis.h           |   9 +-
 tools/llvm-cfi-verify/lib/LLVMBuild.txt            |   2 +-
 tools/llvm-cfi-verify/llvm-cfi-verify.cpp          | 133 ++++++--
 unittests/tools/llvm-cfi-verify/CMakeLists.txt     |   1 +
 unittests/tools/llvm-cfi-verify/FileAnalysis.cpp   |   1 +
 unittests/tools/llvm-cfi-verify/GraphBuilder.cpp   |   1 +
 21 files changed, 1037 insertions(+), 506 deletions(-)
 create mode 100644 test/tools/llvm-cfi-verify/X86/Inputs/protected-lineinfo.s
 create mode 100644 test/tools/llvm-cfi-verify/X86/Inputs/unprotected-fullinfo.s
 create mode 100644 test/tools/llvm-cfi-verify/X86/Inputs/unprotected-lineinfo.s
 create mode 100644 test/tools/llvm-cfi-verify/X86/Inputs/unprotected-nolineinfo.s
 create mode 100644 test/tools/llvm-cfi-verify/X86/blacklist-expected-unprotected.s
 create mode 100644 test/tools/llvm-cfi-verify/X86/blacklist-match-fun.s
 create mode 100644 test/tools/llvm-cfi-verify/X86/blacklist-unexpected-protected.s

diff --git a/test/tools/llvm-cfi-verify/X86/Inputs/protected-lineinfo.s b/test/tools/llvm-cfi-verify/X86/Inputs/protected-lineinfo.s
new file mode 100644
index 00000000000..f8cfcb8d15c
--- /dev/null
+++ b/test/tools/llvm-cfi-verify/X86/Inputs/protected-lineinfo.s
@@ -0,0 +1,195 @@
+# Source (tiny.cc):
+# void a() {}
+# void b() {}
+# int main(int argc, char** argv) {
+# void(*ptr)();
+# if (argc == 1)
+# ptr = &a;
+# else
+# ptr = &b;
+# ptr();
+# }
+# Compile with (output is in tiny.s.0):
+# clang++ -flto -fsanitize=cfi -fvisibility=hidden -c tiny.cc -o tiny.o -gmlt
+# clang++ tiny.o -o tiny -flto
-fuse-ld=gold -Wl,-plugin-opt,save-temps +# clang++ -fsanitize=cfi -flto -fvisibility=hidden -c tiny.cc -o tiny.o -gmlt +# llvm-lto2 run @tiny.resolution.txt -o tiny.s -filetype=asm + + .text + .file "ld-temp.o" + .p2align 4, 0x90 + .type _Z1av.cfi,@function +_Z1av.cfi: +.Lfunc_begin0: + .file 1 "tiny.cc" + .loc 1 1 0 + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +.Ltmp0: + .loc 1 1 11 prologue_end + popq %rbp + retq +.Ltmp1: +.Lfunc_end0: + .size _Z1av.cfi, .Lfunc_end0-_Z1av.cfi + .cfi_endproc + + .p2align 4, 0x90 + .type _Z1bv.cfi,@function +_Z1bv.cfi: +.Lfunc_begin1: + .loc 1 2 0 + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +.Ltmp2: + .loc 1 2 11 prologue_end + popq %rbp + retq +.Ltmp3: +.Lfunc_end1: + .size _Z1bv.cfi, .Lfunc_end1-_Z1bv.cfi + .cfi_endproc + + .hidden main + .globl main + .p2align 4, 0x90 + .type main,@function +main: +.Lfunc_begin2: + .loc 1 4 0 + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + subq $32, %rsp + movl $0, -8(%rbp) + movl %edi, -4(%rbp) + movq %rsi, -24(%rbp) +.Ltmp4: + .loc 1 6 12 prologue_end + cmpl $1, -4(%rbp) + .loc 1 6 7 is_stmt 0 + jne .LBB2_2 + .loc 1 0 7 + leaq _Z1av(%rip), %rax + .loc 1 7 9 is_stmt 1 + movq %rax, -16(%rbp) + .loc 1 7 5 is_stmt 0 + jmp .LBB2_3 +.LBB2_2: + .loc 1 0 5 + leaq _Z1bv(%rip), %rax + .loc 1 9 9 is_stmt 1 + movq %rax, -16(%rbp) +.LBB2_3: + .loc 1 0 9 is_stmt 0 + leaq .L.cfi.jumptable(%rip), %rcx + .loc 1 11 3 is_stmt 1 + movq -16(%rbp), %rax + movq %rax, %rdx + subq %rcx, %rdx + movq %rdx, %rcx + shrq $3, %rcx + shlq $61, %rdx + orq %rcx, %rdx + cmpq $1, %rdx + jbe .LBB2_5 + ud2 +.LBB2_5: + callq *%rax + .loc 1 12 1 + movl -8(%rbp), %eax + addq $32, %rsp + popq %rbp + retq +.Ltmp5: +.Lfunc_end2: + .size main, .Lfunc_end2-main + .cfi_endproc + + .p2align 3, 0x90 + .type .L.cfi.jumptable,@function +.L.cfi.jumptable: +.Lfunc_begin3: + .cfi_startproc + #APP + jmp _Z1av.cfi@PLT + int3 + int3 + int3 + jmp _Z1bv.cfi@PLT + int3 + int3 + int3 + + #NO_APP +.Lfunc_end3: + .size .L.cfi.jumptable, .Lfunc_end3-.L.cfi.jumptable + .cfi_endproc + + .section .debug_str,"MS",@progbits,1 +.Linfo_string0: + .asciz "clang version 6.0.0 (trunk 316774)" +.Linfo_string1: + .asciz "tiny.cc" +.Linfo_string2: + .asciz "" + .section .debug_abbrev,"",@progbits + .byte 1 + .byte 17 + .byte 0 + .byte 37 + .byte 14 + .byte 19 + .byte 5 + .byte 3 + .byte 14 + .byte 16 + .byte 23 + .byte 27 + .byte 14 + .byte 17 + .byte 1 + .byte 18 + .byte 6 + .byte 0 + .byte 0 + .byte 0 + .section .debug_info,"",@progbits +.Lcu_begin0: + .long 38 + .short 4 + .long .debug_abbrev + .byte 8 + .byte 1 + .long .Linfo_string0 + .short 4 + .long .Linfo_string1 + .long .Lline_table_start0 + .long .Linfo_string2 + .quad .Lfunc_begin0 + .long .Lfunc_end2-.Lfunc_begin0 + .section .debug_ranges,"",@progbits + .section .debug_macinfo,"",@progbits +.Lcu_macro_begin0: + .byte 0 + + .type _Z1av,@function +_Z1av = .L.cfi.jumptable + .type _Z1bv,@function +_Z1bv = .L.cfi.jumptable+8 + .ident "clang version 6.0.0 (trunk 316774)" + .section ".note.GNU-stack","",@progbits + .section .debug_line,"",@progbits +.Lline_table_start0: + diff --git a/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-fullinfo.s b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-fullinfo.s new file mode 100644 index 00000000000..7b5ca07d7e4 --- /dev/null +++ 
b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-fullinfo.s @@ -0,0 +1,380 @@ +# Source (tiny.cc): +# void a() {} +# void b() {} +# int main(int argc, char** argv) { +# void(*ptr)(); +# if (argc == 1) +# ptr = &a; +# else +# ptr = &b; +# ptr(); +# } +# Compile with: +# clang++ -g tiny.cc -S -o tiny.s + + .text + .file "tiny.cc" + .globl _Z1av # -- Begin function _Z1av + .p2align 4, 0x90 + .type _Z1av,@function +_Z1av: # @_Z1av +.Lfunc_begin0: + .file 1 "tiny.cc" + .loc 1 1 0 # tiny.cc:1:0 + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +.Ltmp0: + .loc 1 1 11 prologue_end # tiny.cc:1:11 + popq %rbp + .cfi_def_cfa %rsp, 8 + retq +.Ltmp1: +.Lfunc_end0: + .size _Z1av, .Lfunc_end0-_Z1av + .cfi_endproc + # -- End function + .globl _Z1bv # -- Begin function _Z1bv + .p2align 4, 0x90 + .type _Z1bv,@function +_Z1bv: # @_Z1bv +.Lfunc_begin1: + .loc 1 2 0 # tiny.cc:2:0 + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +.Ltmp2: + .loc 1 2 11 prologue_end # tiny.cc:2:11 + popq %rbp + .cfi_def_cfa %rsp, 8 + retq +.Ltmp3: +.Lfunc_end1: + .size _Z1bv, .Lfunc_end1-_Z1bv + .cfi_endproc + # -- End function + .globl main # -- Begin function main + .p2align 4, 0x90 + .type main,@function +main: # @main +.Lfunc_begin2: + .loc 1 4 0 # tiny.cc:4:0 + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + subq $32, %rsp + movl $0, -4(%rbp) + movl %edi, -8(%rbp) + movq %rsi, -16(%rbp) +.Ltmp4: + .loc 1 6 12 prologue_end # tiny.cc:6:12 + cmpl $1, -8(%rbp) +.Ltmp5: + .loc 1 6 7 is_stmt 0 # tiny.cc:6:7 + jne .LBB2_2 +# BB#1: + .loc 1 0 7 # tiny.cc:0:7 + movabsq $_Z1av, %rax +.Ltmp6: + .loc 1 7 9 is_stmt 1 # tiny.cc:7:9 + movq %rax, -24(%rbp) + .loc 1 7 5 is_stmt 0 # tiny.cc:7:5 + jmp .LBB2_3 +.LBB2_2: + .loc 1 0 5 # tiny.cc:0:5 + movabsq $_Z1bv, %rax + .loc 1 9 9 is_stmt 1 # tiny.cc:9:9 + movq %rax, -24(%rbp) +.Ltmp7: +.LBB2_3: + .loc 1 11 3 # tiny.cc:11:3 + callq *-24(%rbp) + .loc 1 12 1 # tiny.cc:12:1 + movl -4(%rbp), %eax + addq $32, %rsp + popq %rbp + .cfi_def_cfa %rsp, 8 + retq +.Ltmp8: +.Lfunc_end2: + .size main, .Lfunc_end2-main + .cfi_endproc + # -- End function + .section .debug_str,"MS",@progbits,1 +.Linfo_string0: + .asciz "clang version 6.0.0 (trunk 317104)" # string offset=0 +.Linfo_string1: + .asciz "tiny.cc" # string offset=35 +.Linfo_string2: + .asciz "/tmp/a/b" # string offset=43 +.Linfo_string3: + .asciz "_Z1av" # string offset=52 +.Linfo_string4: + .asciz "a" # string offset=58 +.Linfo_string5: + .asciz "_Z1bv" # string offset=60 +.Linfo_string6: + .asciz "b" # string offset=66 +.Linfo_string7: + .asciz "main" # string offset=68 +.Linfo_string8: + .asciz "int" # string offset=73 +.Linfo_string9: + .asciz "argc" # string offset=77 +.Linfo_string10: + .asciz "argv" # string offset=82 +.Linfo_string11: + .asciz "char" # string offset=87 +.Linfo_string12: + .asciz "ptr" # string offset=92 + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 14 # DW_FORM_strp + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .ascii "\264B" # DW_AT_GNU_pubnames + .byte 25 # DW_FORM_flag_present + 
.byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 110 # DW_AT_linkage_name + .byte 14 # DW_FORM_strp + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 5 # DW_TAG_formal_parameter + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 7 # Abbreviation Code + .byte 15 # DW_TAG_pointer_type + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 8 # Abbreviation Code + .byte 21 # DW_TAG_subroutine_type + .byte 0 # DW_CHILDREN_no + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long 187 # Length of Unit + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0xb4 DW_TAG_compile_unit + .long .Linfo_string0 # DW_AT_producer + .short 4 # DW_AT_language + .long .Linfo_string1 # DW_AT_name + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Linfo_string2 # DW_AT_comp_dir + # DW_AT_GNU_pubnames + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end2-.Lfunc_begin0 # DW_AT_high_pc + .byte 2 # Abbrev [2] 0x2a:0x19 DW_TAG_subprogram + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .long .Linfo_string3 # DW_AT_linkage_name + .long .Linfo_string4 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + # DW_AT_external + .byte 2 # Abbrev [2] 0x43:0x19 DW_TAG_subprogram + .quad .Lfunc_begin1 # DW_AT_low_pc + .long .Lfunc_end1-.Lfunc_begin1 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .long .Linfo_string5 # DW_AT_linkage_name + .long .Linfo_string6 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + # DW_AT_external + .byte 3 # Abbrev [3] 0x5c:0x44 DW_TAG_subprogram + .quad .Lfunc_begin2 # DW_AT_low_pc + .long .Lfunc_end2-.Lfunc_begin2 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .long .Linfo_string7 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .long 160 # DW_AT_type + # DW_AT_external + .byte 4 # Abbrev [4] 0x75:0xe DW_TAG_formal_parameter + .byte 2 # DW_AT_location + .byte 145 + .byte 120 + .long .Linfo_string9 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .long 160 # DW_AT_type + .byte 4 # Abbrev [4] 0x83:0xe DW_TAG_formal_parameter + .byte 2 # DW_AT_location + .byte 145 + .byte 112 + .long .Linfo_string10 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .long 167 # DW_AT_type + .byte 5 # Abbrev [5] 0x91:0xe DW_TAG_variable + .byte 2 # DW_AT_location + .byte 145 + .byte 104 + .long .Linfo_string12 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 5 # DW_AT_decl_line + .long 184 # DW_AT_type + .byte 0 # End Of Children Mark + .byte 6 # Abbrev [6] 0xa0:0x7 DW_TAG_base_type + .long .Linfo_string8 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 7 # Abbrev [7] 0xa7:0x5 DW_TAG_pointer_type + .long 172 # DW_AT_type + .byte 7 # Abbrev [7] 0xac:0x5 DW_TAG_pointer_type + .long 177 # DW_AT_type + .byte 6 # Abbrev [6] 0xb1:0x7 DW_TAG_base_type + .long .Linfo_string11 # DW_AT_name + .byte 6 # DW_AT_encoding + .byte 1 # DW_AT_byte_size + .byte 7 # Abbrev [7] 0xb8:0x5 DW_TAG_pointer_type + .long 189 # DW_AT_type + .byte 8 # Abbrev [8] 0xbd:0x1 DW_TAG_subroutine_type + .byte 0 # End Of Children Mark + .section .debug_ranges,"",@progbits + .section .debug_macinfo,"",@progbits +.Lcu_macro_begin0: + .byte 0 # End Of Macro List Mark + .section .debug_pubnames,"",@progbits + .long .LpubNames_end0-.LpubNames_begin0 # Length of Public Names Info +.LpubNames_begin0: + .short 2 # DWARF Version + .long .Lcu_begin0 # Offset of Compilation Unit Info + .long 191 # Compilation Unit Length + .long 42 # DIE offset + .asciz "a" # External Name + .long 67 # DIE offset + .asciz "b" # External Name + .long 92 # DIE offset + .asciz "main" # External Name + .long 0 # End Mark +.LpubNames_end0: + .section .debug_pubtypes,"",@progbits + .long .LpubTypes_end0-.LpubTypes_begin0 # Length of Public Types Info +.LpubTypes_begin0: + .short 2 # DWARF Version + .long .Lcu_begin0 # Offset of Compilation Unit Info + .long 191 # Compilation Unit Length + .long 160 # DIE offset + .asciz "int" # 
External Name + .long 177 # DIE offset + .asciz "char" # External Name + .long 0 # End Mark +.LpubTypes_end0: + + .ident "clang version 6.0.0 (trunk 317104)" + .section ".note.GNU-stack","",@progbits + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-lineinfo.s b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-lineinfo.s new file mode 100644 index 00000000000..155f5978b46 --- /dev/null +++ b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-lineinfo.s @@ -0,0 +1,159 @@ +# Source (tiny.cc): +# void a() {} +# void b() {} +# int main(int argc, char** argv) { +# void(*ptr)(); +# if (argc == 1) +# ptr = &a; +# else +# ptr = &b; +# ptr(); +# } +# Compile with: +# clang++ -gmlt tiny.cc -S -o tiny.s + + .text + .file "tiny.cc" + .globl _Z1av # -- Begin function _Z1av + .p2align 4, 0x90 + .type _Z1av,@function +_Z1av: # @_Z1av +.Lfunc_begin0: + .file 1 "tiny.cc" + .loc 1 1 0 # tiny.cc:1:0 + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +.Ltmp0: + .loc 1 1 11 prologue_end # tiny.cc:1:11 + popq %rbp + retq +.Ltmp1: +.Lfunc_end0: + .size _Z1av, .Lfunc_end0-_Z1av + .cfi_endproc + # -- End function + .globl _Z1bv # -- Begin function _Z1bv + .p2align 4, 0x90 + .type _Z1bv,@function +_Z1bv: # @_Z1bv +.Lfunc_begin1: + .loc 1 2 0 # tiny.cc:2:0 + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +.Ltmp2: + .loc 1 2 11 prologue_end # tiny.cc:2:11 + popq %rbp + retq +.Ltmp3: +.Lfunc_end1: + .size _Z1bv, .Lfunc_end1-_Z1bv + .cfi_endproc + # -- End function + .globl main # -- Begin function main + .p2align 4, 0x90 + .type main,@function +main: # @main +.Lfunc_begin2: + .loc 1 4 0 # tiny.cc:4:0 + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + subq $32, %rsp + movl $0, -4(%rbp) + movl %edi, -8(%rbp) + movq %rsi, -16(%rbp) +.Ltmp4: + .loc 1 6 12 prologue_end # tiny.cc:6:12 + cmpl $1, -8(%rbp) + .loc 1 6 7 is_stmt 0 # tiny.cc:6:7 + jne .LBB2_2 +# BB#1: + .loc 1 0 7 # tiny.cc:0:7 + movabsq $_Z1av, %rax + .loc 1 7 9 is_stmt 1 # tiny.cc:7:9 + movq %rax, -24(%rbp) + .loc 1 7 5 is_stmt 0 # tiny.cc:7:5 + jmp .LBB2_3 +.LBB2_2: + .loc 1 0 5 # tiny.cc:0:5 + movabsq $_Z1bv, %rax + .loc 1 9 9 is_stmt 1 # tiny.cc:9:9 + movq %rax, -24(%rbp) +.LBB2_3: + .loc 1 11 3 # tiny.cc:11:3 + callq *-24(%rbp) + .loc 1 12 1 # tiny.cc:12:1 + movl -4(%rbp), %eax + addq $32, %rsp + popq %rbp + retq +.Ltmp5: +.Lfunc_end2: + .size main, .Lfunc_end2-main + .cfi_endproc + # -- End function + .section .debug_str,"MS",@progbits,1 +.Linfo_string0: + .asciz "clang version 6.0.0 (trunk 316774)" # string offset=0 +.Linfo_string1: + .asciz "tiny.cc" # string offset=35 +.Linfo_string2: + .asciz "/tmp/a/b" # string offset=43 + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 0 # DW_CHILDREN_no + .byte 37 # DW_AT_producer + .byte 14 # DW_FORM_strp + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long 
38 # Length of Unit + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x1f DW_TAG_compile_unit + .long .Linfo_string0 # DW_AT_producer + .short 4 # DW_AT_language + .long .Linfo_string1 # DW_AT_name + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Linfo_string2 # DW_AT_comp_dir + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end2-.Lfunc_begin0 # DW_AT_high_pc + .section .debug_ranges,"",@progbits + .section .debug_macinfo,"",@progbits +.Lcu_macro_begin0: + .byte 0 # End Of Macro List Mark + + .ident "clang version 6.0.0 (trunk 316774)" + .section ".note.GNU-stack","",@progbits + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-nolineinfo.s b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-nolineinfo.s new file mode 100644 index 00000000000..2d3cf2f484e --- /dev/null +++ b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-nolineinfo.s @@ -0,0 +1,87 @@ +# Source (tiny.cc): +# void a() {} +# void b() {} +# int main(int argc, char** argv) { +# void(*ptr)(); +# if (argc == 1) +# ptr = &a; +# else +# ptr = &b; +# ptr(); +# } +# Compile with: +# clang++ tiny.cc -S -o tiny.s + + .text + .file "tiny.cc" + .globl _Z1av # -- Begin function _Z1av + .p2align 4, 0x90 + .type _Z1av,@function +_Z1av: # @_Z1av + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + popq %rbp + retq +.Lfunc_end0: + .size _Z1av, .Lfunc_end0-_Z1av + .cfi_endproc + # -- End function + .globl _Z1bv # -- Begin function _Z1bv + .p2align 4, 0x90 + .type _Z1bv,@function +_Z1bv: # @_Z1bv + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + popq %rbp + retq +.Lfunc_end1: + .size _Z1bv, .Lfunc_end1-_Z1bv + .cfi_endproc + # -- End function + .globl main # -- Begin function main + .p2align 4, 0x90 + .type main,@function +main: # @main + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + subq $32, %rsp + movl $0, -4(%rbp) + movl %edi, -8(%rbp) + movq %rsi, -16(%rbp) + cmpl $1, -8(%rbp) + jne .LBB2_2 +# BB#1: + movabsq $_Z1av, %rax + movq %rax, -24(%rbp) + jmp .LBB2_3 +.LBB2_2: + movabsq $_Z1bv, %rax + movq %rax, -24(%rbp) +.LBB2_3: + callq *-24(%rbp) + movl -4(%rbp), %eax + addq $32, %rsp + popq %rbp + retq +.Lfunc_end2: + .size main, .Lfunc_end2-main + .cfi_endproc + # -- End function + + .ident "clang version 6.0.0 (trunk 316774)" + .section ".note.GNU-stack","",@progbits diff --git a/test/tools/llvm-cfi-verify/X86/blacklist-expected-unprotected.s b/test/tools/llvm-cfi-verify/X86/blacklist-expected-unprotected.s new file mode 100644 index 00000000000..fbcfcc2a7cc --- /dev/null +++ b/test/tools/llvm-cfi-verify/X86/blacklist-expected-unprotected.s @@ -0,0 +1,17 @@ +# RUN: llvm-mc %S/Inputs/unprotected-lineinfo.s -filetype obj \ +# RUN: -triple x86_64-linux-elf -o %t.o +# RUN: echo "src:*tiny*" > %t.blacklist.txt +# RUN: llvm-cfi-verify %t.o %t.blacklist.txt | FileCheck %s + +# CHECK-LABEL: U +# CHECK-NEXT: tiny.cc:11 +# CHECK-NEXT: BLACKLIST MATCH, 'src' +# CHECK-NEXT: ====> Expected Unprotected + +# CHECK: Expected Protected: 0 (0.00%) +# CHECK: Unexpected Protected: 0 (0.00%) +# CHECK: Expected Unprotected: 1 (100.00%) +# CHECK: Unexpected Unprotected (BAD): 0 (0.00%) + +# Source: (blacklist.txt): +# src:*tiny* 
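For reference, the blacklist files that these tests generate with `echo` use the
sanitizer special case list format parsed by SpecialCaseList::create. A slightly
fuller sketch follows; the patterns other than `src:*tiny*` and `fun:*main*` are
hypothetical and for illustration only, assuming the in-tree SpecialCaseList
semantics at this revision:

# Unsectioned entries apply to every section query, so they match both the
# "cfi-icall" and "cfi-vcall" lookups that llvm-cfi-verify performs.
src:*tiny*
fun:*main*
# Entries can also be scoped to a named section.
[cfi-icall]
src:*third_party*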
diff --git a/test/tools/llvm-cfi-verify/X86/blacklist-match-fun.s b/test/tools/llvm-cfi-verify/X86/blacklist-match-fun.s new file mode 100644 index 00000000000..3ea829395c4 --- /dev/null +++ b/test/tools/llvm-cfi-verify/X86/blacklist-match-fun.s @@ -0,0 +1,17 @@ +# RUN: llvm-mc %S/Inputs/unprotected-fullinfo.s -filetype obj \ +# RUN: -triple x86_64-linux-elf -o %t.o +# RUN: echo "fun:*main*" > %t.blacklist.txt +# RUN: llvm-cfi-verify %t.o %t.blacklist.txt | FileCheck %s + +# CHECK-LABEL: U +# CHECK-NEXT: tiny.cc:11 +# CHECK-NEXT: BLACKLIST MATCH, 'fun' +# CHECK-NEXT: ====> Expected Unprotected + +# CHECK: Expected Protected: 0 (0.00%) +# CHECK: Unexpected Protected: 0 (0.00%) +# CHECK: Expected Unprotected: 1 (100.00%) +# CHECK: Unexpected Unprotected (BAD): 0 (0.00%) + +# Source: (blacklist.txt): +# fun:*main* diff --git a/test/tools/llvm-cfi-verify/X86/blacklist-unexpected-protected.s b/test/tools/llvm-cfi-verify/X86/blacklist-unexpected-protected.s new file mode 100644 index 00000000000..c6ddf2b5d11 --- /dev/null +++ b/test/tools/llvm-cfi-verify/X86/blacklist-unexpected-protected.s @@ -0,0 +1,17 @@ +# RUN: llvm-mc %S/Inputs/protected-lineinfo.s -filetype obj \ +# RUN: -triple x86_64-linux-elf -o %t.o +# RUN: echo "src:*tiny*" > %t.blacklist.txt +# RUN: llvm-cfi-verify %t.o %t.blacklist.txt | FileCheck %s + +# CHECK-LABEL: P +# CHECK-NEXT: tiny.cc:11 +# CHECK-NEXT: BLACKLIST MATCH, 'src' +# CHECK-NEXT: ====> Unexpected Protected + +# CHECK: Expected Protected: 0 (0.00%) +# CHECK: Unexpected Protected: 1 (100.00%) +# CHECK: Expected Unprotected: 0 (0.00%) +# CHECK: Unexpected Unprotected (BAD): 0 (0.00%) + +# Source: (blacklist.txt): +# src:*tiny* diff --git a/test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s b/test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s index bf1d87a2eb8..e9b873471cb 100644 --- a/test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s +++ b/test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s @@ -10,7 +10,10 @@ # reporting of the cfi-verify program. It should only find a single indirect CF # instruction at `tiny.cc:11` (see protected-lineinfo.s for the source). 
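# The four buckets printed by the tool are keyed on (blacklist match,
# protection status): a protected site with no blacklist match counts as
# Expected Protected, a blacklisted protected site as Unexpected Protected, a
# blacklisted unprotected site as Expected Unprotected, and an unprotected
# site with no blacklist match as Unexpected Unprotected (BAD). With no
# blacklist file, as in this test, only the first and last buckets can be
# populated.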
-# CHECK: Unprotected: 0 (0.00%), Protected: 1 (100.00%) +# CHECK: Expected Protected: 1 (100.00%) +# CHECK: Unexpected Protected: 0 (0.00%) +# CHECK: Expected Unprotected: 0 (0.00%) +# CHECK: Unexpected Unprotected (BAD): 0 (0.00%) .text .file "ld-temp.o" diff --git a/test/tools/llvm-cfi-verify/X86/protected-lineinfo.s b/test/tools/llvm-cfi-verify/X86/protected-lineinfo.s index e3bb0f7af46..8eaf2e5e725 100644 --- a/test/tools/llvm-cfi-verify/X86/protected-lineinfo.s +++ b/test/tools/llvm-cfi-verify/X86/protected-lineinfo.s @@ -1,203 +1,11 @@ -# RUN: llvm-mc %s -filetype obj -triple x86_64-linux-elf -o %t.o +# RUN: llvm-mc %S/Inputs/protected-lineinfo.s -filetype obj \ +# RUN: -triple x86_64-linux-elf -o %t.o # RUN: llvm-cfi-verify %t.o | FileCheck %s # CHECK-LABEL: P # CHECK-NEXT: tiny.cc:11 -# CHECK: Unprotected: 0 (0.00%), Protected: 1 (100.00%) - -# Source (tiny.cc): -# void a() {} -# void b() {} -# int main(int argc, char** argv) { -# void(*ptr)(); -# if (argc == 1) -# ptr = &a; -# else -# ptr = &b; -# ptr(); -# } -# Compile with (output is in tiny.s.0): -# clang++ -flto -fsanitize=cfi -fvisibility=hidden -c tiny.cc -o tiny.o -gmlt -# clang++ tiny.o -o tiny -flto -fuse-ld=gold -Wl,-plugin-opt,save-temps -# clang++ -fsanitize=cfi -flto -fvisibility=hidden -c tiny.cc -o tiny.o -gmlt -# llvm-lto2 run @tiny.resolution.txt -o tiny.s -filetype=asm - - .text - .file "ld-temp.o" - .p2align 4, 0x90 - .type _Z1av.cfi,@function -_Z1av.cfi: -.Lfunc_begin0: - .file 1 "tiny.cc" - .loc 1 1 0 - .cfi_startproc - pushq %rbp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp -.Ltmp0: - .loc 1 1 11 prologue_end - popq %rbp - retq -.Ltmp1: -.Lfunc_end0: - .size _Z1av.cfi, .Lfunc_end0-_Z1av.cfi - .cfi_endproc - - .p2align 4, 0x90 - .type _Z1bv.cfi,@function -_Z1bv.cfi: -.Lfunc_begin1: - .loc 1 2 0 - .cfi_startproc - pushq %rbp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp -.Ltmp2: - .loc 1 2 11 prologue_end - popq %rbp - retq -.Ltmp3: -.Lfunc_end1: - .size _Z1bv.cfi, .Lfunc_end1-_Z1bv.cfi - .cfi_endproc - - .hidden main - .globl main - .p2align 4, 0x90 - .type main,@function -main: -.Lfunc_begin2: - .loc 1 4 0 - .cfi_startproc - pushq %rbp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp - subq $32, %rsp - movl $0, -8(%rbp) - movl %edi, -4(%rbp) - movq %rsi, -24(%rbp) -.Ltmp4: - .loc 1 6 12 prologue_end - cmpl $1, -4(%rbp) - .loc 1 6 7 is_stmt 0 - jne .LBB2_2 - .loc 1 0 7 - leaq _Z1av(%rip), %rax - .loc 1 7 9 is_stmt 1 - movq %rax, -16(%rbp) - .loc 1 7 5 is_stmt 0 - jmp .LBB2_3 -.LBB2_2: - .loc 1 0 5 - leaq _Z1bv(%rip), %rax - .loc 1 9 9 is_stmt 1 - movq %rax, -16(%rbp) -.LBB2_3: - .loc 1 0 9 is_stmt 0 - leaq .L.cfi.jumptable(%rip), %rcx - .loc 1 11 3 is_stmt 1 - movq -16(%rbp), %rax - movq %rax, %rdx - subq %rcx, %rdx - movq %rdx, %rcx - shrq $3, %rcx - shlq $61, %rdx - orq %rcx, %rdx - cmpq $1, %rdx - jbe .LBB2_5 - ud2 -.LBB2_5: - callq *%rax - .loc 1 12 1 - movl -8(%rbp), %eax - addq $32, %rsp - popq %rbp - retq -.Ltmp5: -.Lfunc_end2: - .size main, .Lfunc_end2-main - .cfi_endproc - - .p2align 3, 0x90 - .type .L.cfi.jumptable,@function -.L.cfi.jumptable: -.Lfunc_begin3: - .cfi_startproc - #APP - jmp _Z1av.cfi@PLT - int3 - int3 - int3 - jmp _Z1bv.cfi@PLT - int3 - int3 - int3 - - #NO_APP -.Lfunc_end3: - .size .L.cfi.jumptable, .Lfunc_end3-.L.cfi.jumptable - .cfi_endproc - - .section .debug_str,"MS",@progbits,1 -.Linfo_string0: - .asciz "clang version 6.0.0 (trunk 
316774)" -.Linfo_string1: - .asciz "tiny.cc" -.Linfo_string2: - .asciz "" - .section .debug_abbrev,"",@progbits - .byte 1 - .byte 17 - .byte 0 - .byte 37 - .byte 14 - .byte 19 - .byte 5 - .byte 3 - .byte 14 - .byte 16 - .byte 23 - .byte 27 - .byte 14 - .byte 17 - .byte 1 - .byte 18 - .byte 6 - .byte 0 - .byte 0 - .byte 0 - .section .debug_info,"",@progbits -.Lcu_begin0: - .long 38 - .short 4 - .long .debug_abbrev - .byte 8 - .byte 1 - .long .Linfo_string0 - .short 4 - .long .Linfo_string1 - .long .Lline_table_start0 - .long .Linfo_string2 - .quad .Lfunc_begin0 - .long .Lfunc_end2-.Lfunc_begin0 - .section .debug_ranges,"",@progbits - .section .debug_macinfo,"",@progbits -.Lcu_macro_begin0: - .byte 0 - - .type _Z1av,@function -_Z1av = .L.cfi.jumptable - .type _Z1bv,@function -_Z1bv = .L.cfi.jumptable+8 - .ident "clang version 6.0.0 (trunk 316774)" - .section ".note.GNU-stack","",@progbits - .section .debug_line,"",@progbits -.Lline_table_start0: - +# CHECK: Expected Protected: 1 (100.00%) +# CHECK: Unexpected Protected: 0 (0.00%) +# CHECK: Expected Unprotected: 0 (0.00%) +# CHECK: Unexpected Unprotected (BAD): 0 (0.00%) diff --git a/test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s b/test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s index d8819e16e37..65782cb5e42 100644 --- a/test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s +++ b/test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s @@ -1,167 +1,11 @@ -# RUN: llvm-mc %s -filetype obj -triple x86_64-linux-elf -o %t.o +# RUN: llvm-mc %S/Inputs/unprotected-lineinfo.s -filetype obj \ +# RUN: -triple x86_64-linux-elf -o %t.o # RUN: llvm-cfi-verify %t.o | FileCheck %s # CHECK-LABEL: U # CHECK-NEXT: tiny.cc:11 -# CHECK: Unprotected: 1 (100.00%), Protected: 0 (0.00%) - -# Source (tiny.cc): -# void a() {} -# void b() {} -# int main(int argc, char** argv) { -# void(*ptr)(); -# if (argc == 1) -# ptr = &a; -# else -# ptr = &b; -# ptr(); -# } -# Compile with: -# clang++ -gmlt tiny.cc -S -o tiny.s - - .text - .file "tiny.cc" - .globl _Z1av # -- Begin function _Z1av - .p2align 4, 0x90 - .type _Z1av,@function -_Z1av: # @_Z1av -.Lfunc_begin0: - .file 1 "tiny.cc" - .loc 1 1 0 # tiny.cc:1:0 - .cfi_startproc -# BB#0: - pushq %rbp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp -.Ltmp0: - .loc 1 1 11 prologue_end # tiny.cc:1:11 - popq %rbp - retq -.Ltmp1: -.Lfunc_end0: - .size _Z1av, .Lfunc_end0-_Z1av - .cfi_endproc - # -- End function - .globl _Z1bv # -- Begin function _Z1bv - .p2align 4, 0x90 - .type _Z1bv,@function -_Z1bv: # @_Z1bv -.Lfunc_begin1: - .loc 1 2 0 # tiny.cc:2:0 - .cfi_startproc -# BB#0: - pushq %rbp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp -.Ltmp2: - .loc 1 2 11 prologue_end # tiny.cc:2:11 - popq %rbp - retq -.Ltmp3: -.Lfunc_end1: - .size _Z1bv, .Lfunc_end1-_Z1bv - .cfi_endproc - # -- End function - .globl main # -- Begin function main - .p2align 4, 0x90 - .type main,@function -main: # @main -.Lfunc_begin2: - .loc 1 4 0 # tiny.cc:4:0 - .cfi_startproc -# BB#0: - pushq %rbp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp - subq $32, %rsp - movl $0, -4(%rbp) - movl %edi, -8(%rbp) - movq %rsi, -16(%rbp) -.Ltmp4: - .loc 1 6 12 prologue_end # tiny.cc:6:12 - cmpl $1, -8(%rbp) - .loc 1 6 7 is_stmt 0 # tiny.cc:6:7 - jne .LBB2_2 -# BB#1: - .loc 1 0 7 # tiny.cc:0:7 - movabsq $_Z1av, %rax - .loc 1 7 9 is_stmt 1 # tiny.cc:7:9 - movq %rax, -24(%rbp) - .loc 1 7 5 is_stmt 0 # tiny.cc:7:5 - jmp .LBB2_3 
-.LBB2_2: - .loc 1 0 5 # tiny.cc:0:5 - movabsq $_Z1bv, %rax - .loc 1 9 9 is_stmt 1 # tiny.cc:9:9 - movq %rax, -24(%rbp) -.LBB2_3: - .loc 1 11 3 # tiny.cc:11:3 - callq *-24(%rbp) - .loc 1 12 1 # tiny.cc:12:1 - movl -4(%rbp), %eax - addq $32, %rsp - popq %rbp - retq -.Ltmp5: -.Lfunc_end2: - .size main, .Lfunc_end2-main - .cfi_endproc - # -- End function - .section .debug_str,"MS",@progbits,1 -.Linfo_string0: - .asciz "clang version 6.0.0 (trunk 316774)" # string offset=0 -.Linfo_string1: - .asciz "tiny.cc" # string offset=35 -.Linfo_string2: - .asciz "/tmp/a/b" # string offset=43 - .section .debug_abbrev,"",@progbits - .byte 1 # Abbreviation Code - .byte 17 # DW_TAG_compile_unit - .byte 0 # DW_CHILDREN_no - .byte 37 # DW_AT_producer - .byte 14 # DW_FORM_strp - .byte 19 # DW_AT_language - .byte 5 # DW_FORM_data2 - .byte 3 # DW_AT_name - .byte 14 # DW_FORM_strp - .byte 16 # DW_AT_stmt_list - .byte 23 # DW_FORM_sec_offset - .byte 27 # DW_AT_comp_dir - .byte 14 # DW_FORM_strp - .byte 17 # DW_AT_low_pc - .byte 1 # DW_FORM_addr - .byte 18 # DW_AT_high_pc - .byte 6 # DW_FORM_data4 - .byte 0 # EOM(1) - .byte 0 # EOM(2) - .byte 0 # EOM(3) - .section .debug_info,"",@progbits -.Lcu_begin0: - .long 38 # Length of Unit - .short 4 # DWARF version number - .long .debug_abbrev # Offset Into Abbrev. Section - .byte 8 # Address Size (in bytes) - .byte 1 # Abbrev [1] 0xb:0x1f DW_TAG_compile_unit - .long .Linfo_string0 # DW_AT_producer - .short 4 # DW_AT_language - .long .Linfo_string1 # DW_AT_name - .long .Lline_table_start0 # DW_AT_stmt_list - .long .Linfo_string2 # DW_AT_comp_dir - .quad .Lfunc_begin0 # DW_AT_low_pc - .long .Lfunc_end2-.Lfunc_begin0 # DW_AT_high_pc - .section .debug_ranges,"",@progbits - .section .debug_macinfo,"",@progbits -.Lcu_macro_begin0: - .byte 0 # End Of Macro List Mark - - .ident "clang version 6.0.0 (trunk 316774)" - .section ".note.GNU-stack","",@progbits - .section .debug_line,"",@progbits -.Lline_table_start0: +# CHECK: Expected Protected: 0 (0.00%) +# CHECK: Unexpected Protected: 0 (0.00%) +# CHECK: Expected Unprotected: 0 (0.00%) +# CHECK: Unexpected Unprotected (BAD): 1 (100.00%) diff --git a/test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s b/test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s index c023a4a84ab..246acf35f5b 100644 --- a/test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s +++ b/test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s @@ -1,92 +1,5 @@ -# RUN: llvm-mc %s -filetype obj -triple x86_64-linux-elf -o %t.o +# RUN: llvm-mc %S/Inputs/unprotected-nolineinfo.s -filetype obj \ +# RUN: -triple x86_64-linux-elf -o %t.o # RUN: not llvm-cfi-verify %t.o 2>&1 | FileCheck %s # CHECK: DWARF line information missing. Did you compile with '-g'? 
-
-# Source (tiny.cc):
-# void a() {}
-# void b() {}
-# int main(int argc, char** argv) {
-# void(*ptr)();
-# if (argc == 1)
-# ptr = &a;
-# else
-# ptr = &b;
-# ptr();
-# }
-# Compile with:
-# clang++ tiny.cc -S -o tiny.s
-
- .text
- .file "tiny.cc"
- .globl _Z1av                   # -- Begin function _Z1av
- .p2align 4, 0x90
- .type _Z1av,@function
-_Z1av:                                  # @_Z1av
- .cfi_startproc
-# BB#0:
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
- popq %rbp
- retq
-.Lfunc_end0:
- .size _Z1av, .Lfunc_end0-_Z1av
- .cfi_endproc
-                                        # -- End function
- .globl _Z1bv                   # -- Begin function _Z1bv
- .p2align 4, 0x90
- .type _Z1bv,@function
-_Z1bv:                                  # @_Z1bv
- .cfi_startproc
-# BB#0:
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
- popq %rbp
- retq
-.Lfunc_end1:
- .size _Z1bv, .Lfunc_end1-_Z1bv
- .cfi_endproc
-                                        # -- End function
- .globl main                    # -- Begin function main
- .p2align 4, 0x90
- .type main,@function
-main:                                   # @main
- .cfi_startproc
-# BB#0:
- pushq %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- movq %rsp, %rbp
- .cfi_def_cfa_register %rbp
- subq $32, %rsp
- movl $0, -4(%rbp)
- movl %edi, -8(%rbp)
- movq %rsi, -16(%rbp)
- cmpl $1, -8(%rbp)
- jne .LBB2_2
-# BB#1:
- movabsq $_Z1av, %rax
- movq %rax, -24(%rbp)
- jmp .LBB2_3
-.LBB2_2:
- movabsq $_Z1bv, %rax
- movq %rax, -24(%rbp)
-.LBB2_3:
- callq *-24(%rbp)
- movl -4(%rbp), %eax
- addq $32, %rsp
- popq %rbp
- retq
-.Lfunc_end2:
- .size main, .Lfunc_end2-main
- .cfi_endproc
-                                        # -- End function
-
- .ident "clang version 6.0.0 (trunk 316774)"
- .section ".note.GNU-stack","",@progbits
diff --git a/tools/llvm-cfi-verify/CMakeLists.txt b/tools/llvm-cfi-verify/CMakeLists.txt
index 07c6504bf48..de6a46e7859 100644
--- a/tools/llvm-cfi-verify/CMakeLists.txt
+++ b/tools/llvm-cfi-verify/CMakeLists.txt
@@ -4,11 +4,11 @@ set(LLVM_LINK_COMPONENTS
   AllTargetsDescs
   AllTargetsDisassemblers
   AllTargetsInfos
-  DebugInfoDWARF
   MC
   MCParser
   Object
   Support
+  Symbolize
   )

 add_llvm_tool(llvm-cfi-verify
diff --git a/tools/llvm-cfi-verify/LLVMBuild.txt b/tools/llvm-cfi-verify/LLVMBuild.txt
index 5c4ce263090..d5e93230272 100644
--- a/tools/llvm-cfi-verify/LLVMBuild.txt
+++ b/tools/llvm-cfi-verify/LLVMBuild.txt
@@ -19,4 +19,4 @@ type = Tool
 name = llvm-cfi-verify
 parent = Tools
-required_libraries = all-targets DebugInfoDWARF MC MCDisassembler MCParser Support
+required_libraries = all-targets MC MCDisassembler MCParser Support Symbolize
diff --git a/tools/llvm-cfi-verify/lib/CMakeLists.txt b/tools/llvm-cfi-verify/lib/CMakeLists.txt
index c90e4ed485e..030bfa5d6c7 100644
--- a/tools/llvm-cfi-verify/lib/CMakeLists.txt
+++ b/tools/llvm-cfi-verify/lib/CMakeLists.txt
@@ -11,5 +11,6 @@ llvm_map_components_to_libnames(libs
   MC
   MCParser
   Object
-  Support)
+  Support
+  Symbolize)

 target_link_libraries(LLVMCFIVerify ${libs})
diff --git a/tools/llvm-cfi-verify/lib/FileAnalysis.cpp b/tools/llvm-cfi-verify/lib/FileAnalysis.cpp
index 278e861dfd3..0d4e1f497ff 100644
--- a/tools/llvm-cfi-verify/lib/FileAnalysis.cpp
+++ b/tools/llvm-cfi-verify/lib/FileAnalysis.cpp
@@ -39,22 +39,20 @@
 #include

 using Instr = llvm::cfi_verify::FileAnalysis::Instr;
+using LLVMSymbolizer = llvm::symbolize::LLVMSymbolizer;

 namespace llvm {
 namespace cfi_verify {

-static cl::opt<bool> IgnoreDWARF(
+bool IgnoreDWARFFlag;
+
+static cl::opt<bool, true> IgnoreDWARFArg(
     "ignore-dwarf",
     cl::desc(
         "Ignore all DWARF data. This relaxes the requirements for all "
         "statically linked libraries to have been compiled with '-g', but "
         "will result in false positives for 'CFI unprotected' instructions."),
-    cl::init(false));
-
-cl::opt<uint64_t> DWARFSearchRange(
-    "dwarf-search-range",
-    cl::desc("Address search range used to determine if instruction is valid."),
-    cl::init(0x10));
+    cl::location(IgnoreDWARFFlag), cl::init(false));

 Expected<FileAnalysis> FileAnalysis::Create(StringRef Filename) {
   // Open the filename provided.
@@ -256,12 +254,16 @@ const MCInstrAnalysis *FileAnalysis::getMCInstrAnalysis() const {
   return MIA.get();
 }

+LLVMSymbolizer &FileAnalysis::getSymbolizer() { return *Symbolizer; }
+
 Error FileAnalysis::initialiseDisassemblyMembers() {
   std::string TripleName = ObjectTriple.getTriple();
   ArchName = "";
   MCPU = "";
   std::string ErrorString;

+  Symbolizer.reset(new LLVMSymbolizer());
+
   ObjectTarget =
       TargetRegistry::lookupTarget(ArchName, ObjectTriple, ErrorString);
   if (!ObjectTarget)
@@ -308,8 +310,8 @@ Error FileAnalysis::initialiseDisassemblyMembers() {
 }

 Error FileAnalysis::parseCodeSections() {
-  if (!IgnoreDWARF) {
-    DWARF.reset(DWARFContext::create(*Object).release());
+  if (!IgnoreDWARFFlag) {
+    std::unique_ptr<DWARFContext> DWARF = DWARFContext::create(*Object);
     if (!DWARF)
       return make_error<StringError>("Could not create DWARF information.",
                                      inconvertibleErrorCode());
@@ -347,21 +349,9 @@ Error FileAnalysis::parseCodeSections() {
   return Error::success();
 }

-DILineInfoTable FileAnalysis::getLineInfoForAddressRange(uint64_t Address) {
-  if (!hasLineTableInfo())
-    return DILineInfoTable();
-
-  return DWARF->getLineInfoForAddressRange(Address, DWARFSearchRange);
-}
-
-bool FileAnalysis::hasValidLineInfoForAddressRange(uint64_t Address) {
-  return !getLineInfoForAddressRange(Address).empty();
-}
-
-bool FileAnalysis::hasLineTableInfo() const { return DWARF != nullptr; }
-
 void FileAnalysis::parseSectionContents(ArrayRef<uint8_t> SectionBytes,
                                         uint64_t SectionAddress) {
+  assert(Symbolizer && "Symbolizer is uninitialised.");
   MCInst Instruction;
   Instr InstrMeta;
   uint64_t InstructionSize;
@@ -381,8 +371,19 @@ void FileAnalysis::parseSectionContents(ArrayRef<uint8_t> SectionBytes,
     InstrMeta.Valid = ValidInstruction;

     // Check if this instruction exists in the range of the DWARF metadata.
-    if (hasLineTableInfo() && !hasValidLineInfoForAddressRange(VMAddress))
-      continue;
+    if (!IgnoreDWARFFlag) {
+      auto LineInfo =
+          Symbolizer->symbolizeCode(Object->getFileName(), VMAddress);
+      if (!LineInfo) {
+        handleAllErrors(LineInfo.takeError(), [](const ErrorInfoBase &E) {
+          errs() << "Symbolizer failed to get line: " << E.message() << "\n";
+        });
+        continue;
+      }
+
+      if (LineInfo->FileName == "<invalid>")
+        continue;
+    }

     addInstruction(InstrMeta);

diff --git a/tools/llvm-cfi-verify/lib/FileAnalysis.h b/tools/llvm-cfi-verify/lib/FileAnalysis.h
index 9945a2110a2..e0eecb037c3 100644
--- a/tools/llvm-cfi-verify/lib/FileAnalysis.h
+++ b/tools/llvm-cfi-verify/lib/FileAnalysis.h
@@ -12,7 +12,7 @@

 #include "llvm/ADT/DenseMap.h"
 #include "llvm/BinaryFormat/ELF.h"
-#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/Symbolize/Symbolize.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
@@ -44,6 +44,8 @@
 namespace llvm {
 namespace cfi_verify {

+extern bool IgnoreDWARFFlag;
+
 // Disassembler and analysis tool for machine code files. Keeps track of non-
 // sequential control flows, including indirect control flow instructions.
class FileAnalysis {
@@ -120,6 +122,7 @@ public:
   const MCRegisterInfo *getRegisterInfo() const;
   const MCInstrInfo *getMCInstrInfo() const;
   const MCInstrAnalysis *getMCInstrAnalysis() const;
+  symbolize::LLVMSymbolizer &getSymbolizer();

   // Returns true if this class is using DWARF line tables for elimination.
   bool hasLineTableInfo() const;
@@ -175,8 +178,8 @@ private:
   std::unique_ptr<const MCInstrAnalysis> MIA;
   std::unique_ptr<MCInstPrinter> Printer;

-  // DWARF debug information.
-  std::unique_ptr<DWARFContext> DWARF;
+  // Symbolizer used for debug information parsing.
+  std::unique_ptr<symbolize::LLVMSymbolizer> Symbolizer;

   // A mapping between the virtual memory address to the instruction metadata
   // struct. TODO(hctim): Reimplement this as a sorted vector to avoid per-
diff --git a/tools/llvm-cfi-verify/lib/LLVMBuild.txt b/tools/llvm-cfi-verify/lib/LLVMBuild.txt
index 99b678fc88a..c0ae1905521 100644
--- a/tools/llvm-cfi-verify/lib/LLVMBuild.txt
+++ b/tools/llvm-cfi-verify/lib/LLVMBuild.txt
@@ -19,4 +19,4 @@ type = Library
 name = CFIVerify
 parent = Libraries
-required_libraries = DebugInfoDWARF MC MCDisassembler MCParser Support
+required_libraries = DebugInfoDWARF MC MCDisassembler MCParser Support Symbolize
diff --git a/tools/llvm-cfi-verify/llvm-cfi-verify.cpp b/tools/llvm-cfi-verify/llvm-cfi-verify.cpp
index d4a46fcc226..a3c202f53bb 100644
--- a/tools/llvm-cfi-verify/llvm-cfi-verify.cpp
+++ b/tools/llvm-cfi-verify/llvm-cfi-verify.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/SpecialCaseList.h"

 #include <cstdlib>

@@ -32,48 +33,120 @@ using namespace llvm::cfi_verify;

 cl::opt<std::string> InputFilename(cl::Positional, cl::desc("<input file>"),
                                    cl::Required);
+cl::opt<std::string> BlacklistFilename(cl::Positional,
+                                       cl::desc("[blacklist file]"),
+                                       cl::init("-"));

 ExitOnError ExitOnErr;

-void printIndirectCFInstructions(FileAnalysis &Analysis) {
-  uint64_t ProtectedCount = 0;
-  uint64_t UnprotectedCount = 0;
+void printIndirectCFInstructions(FileAnalysis &Analysis,
+                                 const SpecialCaseList *SpecialCaseList) {
+  uint64_t ExpectedProtected = 0;
+  uint64_t UnexpectedProtected = 0;
+  uint64_t ExpectedUnprotected = 0;
+  uint64_t UnexpectedUnprotected = 0;
+
+  symbolize::LLVMSymbolizer &Symbolizer = Analysis.getSymbolizer();

   for (uint64_t Address : Analysis.getIndirectInstructions()) {
     const auto &InstrMeta = Analysis.getInstructionOrDie(Address);
-    if (Analysis.isIndirectInstructionCFIProtected(Address)) {
+    bool CFIProtected = Analysis.isIndirectInstructionCFIProtected(Address);
+
+    if (CFIProtected)
       outs() << "P ";
-      ProtectedCount++;
-    } else {
+    else
       outs() << "U ";
-      UnprotectedCount++;
-    }

     outs() << format_hex(Address, 2) << " | "
            << Analysis.getMCInstrInfo()->getName(
                   InstrMeta.Instruction.getOpcode())
-           << " ";
-    outs() << "\n";
-
-    if (Analysis.hasLineTableInfo()) {
-      for (const auto &LineKV : Analysis.getLineInfoForAddressRange(Address)) {
-        outs() << "  " << format_hex(LineKV.first, 2) << " = "
-               << LineKV.second.FileName << ":" << LineKV.second.Line << ":"
-               << LineKV.second.Column << " (" << LineKV.second.FunctionName
-               << ")\n";
+           << " \n";
+
+    if (IgnoreDWARFFlag) {
+      if (CFIProtected)
+        ExpectedProtected++;
+      else
+        UnexpectedUnprotected++;
+      continue;
+    }
+
+    auto InliningInfo = Symbolizer.symbolizeInlinedCode(InputFilename, Address);
+    if (!InliningInfo || InliningInfo->getNumberOfFrames() == 0) {
+      errs() << "Failed to symbolise " << format_hex(Address, 2)
+             << " with line tables from " << InputFilename << "\n";
+      exit(EXIT_FAILURE);
+    }
+
+    const auto &LineInfo =
+        InliningInfo->getFrame(InliningInfo->getNumberOfFrames() - 1);
+
+    // Print the inlining symbolisation of this instruction.
+    for (uint32_t i = 0; i < InliningInfo->getNumberOfFrames(); ++i) {
+      const auto &Line = InliningInfo->getFrame(i);
+      outs() << "  " << format_hex(Address, 2) << " = " << Line.FileName << ":"
+             << Line.Line << ":" << Line.Column << " (" << Line.FunctionName
+             << ")\n";
+    }
+
+    if (!SpecialCaseList) {
+      if (CFIProtected)
+        ExpectedProtected++;
+      else
+        UnexpectedUnprotected++;
+      continue;
+    }
+
+    bool MatchesBlacklistRule = false;
+    if (SpecialCaseList->inSection("cfi-icall", "src", LineInfo.FileName) ||
+        SpecialCaseList->inSection("cfi-vcall", "src", LineInfo.FileName)) {
+      outs() << "BLACKLIST MATCH, 'src'\n";
+      MatchesBlacklistRule = true;
+    }
+
+    if (SpecialCaseList->inSection("cfi-icall", "fun", LineInfo.FunctionName) ||
+        SpecialCaseList->inSection("cfi-vcall", "fun", LineInfo.FunctionName)) {
+      outs() << "BLACKLIST MATCH, 'fun'\n";
+      MatchesBlacklistRule = true;
+    }
+
+    if (MatchesBlacklistRule) {
+      if (CFIProtected) {
+        UnexpectedProtected++;
+        outs() << "====> Unexpected Protected\n";
+      } else {
+        ExpectedUnprotected++;
+        outs() << "====> Expected Unprotected\n";
+      }
+    } else {
+      if (CFIProtected) {
+        ExpectedProtected++;
+        outs() << "====> Expected Protected\n";
+      } else {
+        UnexpectedUnprotected++;
+        outs() << "====> Unexpected Unprotected\n";
       }
     }
   }

-  if (ProtectedCount || UnprotectedCount)
-    outs() << formatv(
-        "Unprotected: {0} ({1:P}), Protected: {2} ({3:P})\n", UnprotectedCount,
-        (((double)UnprotectedCount) / (UnprotectedCount + ProtectedCount)),
-        ProtectedCount,
-        (((double)ProtectedCount) / (UnprotectedCount + ProtectedCount)));
-  else
+  uint64_t IndirectCFInstructions = ExpectedProtected + UnexpectedProtected +
+                                    ExpectedUnprotected + UnexpectedUnprotected;
+
+  // Return early to avoid dividing by zero in the summary below.
+  if (IndirectCFInstructions == 0) {
     outs() << "No indirect CF instructions found.\n";
+    return;
+  }
+
+  outs() << formatv("Expected Protected: {0} ({1:P})\n"
+                    "Unexpected Protected: {2} ({3:P})\n"
+                    "Expected Unprotected: {4} ({5:P})\n"
+                    "Unexpected Unprotected (BAD): {6} ({7:P})\n",
+                    ExpectedProtected,
+                    ((double)ExpectedProtected) / IndirectCFInstructions,
+                    UnexpectedProtected,
+                    ((double)UnexpectedProtected) / IndirectCFInstructions,
+                    ExpectedUnprotected,
+                    ((double)ExpectedUnprotected) / IndirectCFInstructions,
+                    UnexpectedUnprotected,
+                    ((double)UnexpectedUnprotected) / IndirectCFInstructions);
 }

 int main(int argc, char **argv) {
@@ -89,8 +162,18 @@ int main(int argc, char **argv) {
   InitializeAllAsmParsers();
   InitializeAllDisassemblers();

+  std::unique_ptr<SpecialCaseList> SpecialCaseList;
+  if (BlacklistFilename != "-") {
+    std::string Error;
+    SpecialCaseList = SpecialCaseList::create({BlacklistFilename}, Error);
+    if (!SpecialCaseList) {
+      errs() << "Failed to get blacklist: " << Error << "\n";
+      exit(EXIT_FAILURE);
+    }
+  }
+
   FileAnalysis Analysis = ExitOnErr(FileAnalysis::Create(InputFilename));
-  printIndirectCFInstructions(Analysis);
+  printIndirectCFInstructions(Analysis, SpecialCaseList.get());

   return EXIT_SUCCESS;
 }
diff --git a/unittests/tools/llvm-cfi-verify/CMakeLists.txt b/unittests/tools/llvm-cfi-verify/CMakeLists.txt
index ad3266c2777..adb7a55327a 100644
--- a/unittests/tools/llvm-cfi-verify/CMakeLists.txt
+++ b/unittests/tools/llvm-cfi-verify/CMakeLists.txt
@@ -8,6 +8,7 @@ set(LLVM_LINK_COMPONENTS
   MCParser
   Object
   Support
+  Symbolize
   )

 add_llvm_unittest(CFIVerifyTests
diff --git a/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp b/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp
index
a3da1fc3f56..00346ab5a14 100644 --- a/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp +++ b/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp @@ -64,6 +64,7 @@ public: class BasicFileAnalysisTest : public ::testing::Test { protected: virtual void SetUp() { + IgnoreDWARFFlag = true; SuccessfullyInitialised = true; if (auto Err = Analysis.initialiseDisassemblyMembers()) { handleAllErrors(std::move(Err), [&](const UnsupportedDisassembly &E) { diff --git a/unittests/tools/llvm-cfi-verify/GraphBuilder.cpp b/unittests/tools/llvm-cfi-verify/GraphBuilder.cpp index b200677dd09..a7d09b54781 100644 --- a/unittests/tools/llvm-cfi-verify/GraphBuilder.cpp +++ b/unittests/tools/llvm-cfi-verify/GraphBuilder.cpp @@ -126,6 +126,7 @@ public: class BasicGraphBuilderTest : public ::testing::Test { protected: virtual void SetUp() { + IgnoreDWARFFlag = true; SuccessfullyInitialised = true; if (auto Err = Analysis.initialiseDisassemblyMembers()) { handleAllErrors(std::move(Err), [&](const UnsupportedDisassembly &E) { -- cgit v1.2.3 From 352adf2ec9cb3ee1fc07370e4d6b34028dd80bf3 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Fri, 3 Nov 2017 20:57:09 +0000 Subject: llvm-objdump: Fix unused-lambda-capture warning by removing unused lambda capture git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317365 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-objcopy/llvm-objcopy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp index 52091d3e183..5a09f8f18db 100644 --- a/tools/llvm-objcopy/llvm-objcopy.cpp +++ b/tools/llvm-objcopy/llvm-objcopy.cpp @@ -160,7 +160,7 @@ void CopyBinary(const ELFObjectFile &ObjFile) { } if (StripDWO || !SplitDWO.empty()) - RemovePred = [RemovePred, &Obj](const SectionBase &Sec) { + RemovePred = [RemovePred](const SectionBase &Sec) { return IsDWOSection(Sec) || RemovePred(Sec); }; -- cgit v1.2.3 From 7711c315b294abaa47e3933ec470e04fa5b8ae80 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Fri, 3 Nov 2017 20:57:10 +0000 Subject: GCOV: Move GCOV from IR & Support into ProfileData to fix layering This class was split between libIR and libSupport, which breaks under modular code generation. Move it into the one library that uses it, ProfileData, to resolve this issue. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317366 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ProfileData/GCOV.h | 460 ++++++++++++++++ include/llvm/ProfileData/SampleProfReader.h | 2 +- include/llvm/Support/GCOV.h | 460 ---------------- lib/IR/CMakeLists.txt | 1 - lib/IR/GCOV.cpp | 821 ---------------------------- lib/ProfileData/CMakeLists.txt | 1 + lib/ProfileData/GCOV.cpp | 821 ++++++++++++++++++++++++++++ tools/llvm-cov/gcov.cpp | 2 +- 8 files changed, 1284 insertions(+), 1284 deletions(-) create mode 100644 include/llvm/ProfileData/GCOV.h delete mode 100644 include/llvm/Support/GCOV.h delete mode 100644 lib/IR/GCOV.cpp create mode 100644 lib/ProfileData/GCOV.cpp diff --git a/include/llvm/ProfileData/GCOV.h b/include/llvm/ProfileData/GCOV.h new file mode 100644 index 00000000000..497f80b87b2 --- /dev/null +++ b/include/llvm/ProfileData/GCOV.h @@ -0,0 +1,460 @@ +//===- GCOV.h - LLVM coverage tool ------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
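Since the header moves wholesale, downstream code follows the same one-line update that SampleProfReader.h receives later in this patch; a hedged sketch for a hypothetical out-of-tree client:

  // Previously: #include "llvm/Support/GCOV.h"
  #include "llvm/ProfileData/GCOV.h"  // new home of the GCOV reader

Such a client would also link the ProfileData component for this code rather than relying on Core/Support, mirroring the CMake and LLVMBuild changes below.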
+// +//===----------------------------------------------------------------------===// +// +// This header provides the interface to read and write coverage files that +// use 'gcov' format. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_PROFILEDATA_GCOV_H +#define LLVM_PROFILEDATA_GCOV_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include +#include +#include + +namespace llvm { + +class GCOVFunction; +class GCOVBlock; +class FileInfo; + +namespace GCOV { + +enum GCOVVersion { V402, V404, V704 }; + +/// \brief A struct for passing gcov options between functions. +struct Options { + Options(bool A, bool B, bool C, bool F, bool P, bool U, bool L, bool N) + : AllBlocks(A), BranchInfo(B), BranchCount(C), FuncCoverage(F), + PreservePaths(P), UncondBranch(U), LongFileNames(L), NoOutput(N) {} + + bool AllBlocks; + bool BranchInfo; + bool BranchCount; + bool FuncCoverage; + bool PreservePaths; + bool UncondBranch; + bool LongFileNames; + bool NoOutput; +}; + +} // end namespace GCOV + +/// GCOVBuffer - A wrapper around MemoryBuffer to provide GCOV specific +/// read operations. +class GCOVBuffer { +public: + GCOVBuffer(MemoryBuffer *B) : Buffer(B) {} + + /// readGCNOFormat - Check GCNO signature is valid at the beginning of buffer. + bool readGCNOFormat() { + StringRef File = Buffer->getBuffer().slice(0, 4); + if (File != "oncg") { + errs() << "Unexpected file type: " << File << ".\n"; + return false; + } + Cursor = 4; + return true; + } + + /// readGCDAFormat - Check GCDA signature is valid at the beginning of buffer. + bool readGCDAFormat() { + StringRef File = Buffer->getBuffer().slice(0, 4); + if (File != "adcg") { + errs() << "Unexpected file type: " << File << ".\n"; + return false; + } + Cursor = 4; + return true; + } + + /// readGCOVVersion - Read GCOV version. + bool readGCOVVersion(GCOV::GCOVVersion &Version) { + StringRef VersionStr = Buffer->getBuffer().slice(Cursor, Cursor + 4); + if (VersionStr == "*204") { + Cursor += 4; + Version = GCOV::V402; + return true; + } + if (VersionStr == "*404") { + Cursor += 4; + Version = GCOV::V404; + return true; + } + if (VersionStr == "*704") { + Cursor += 4; + Version = GCOV::V704; + return true; + } + errs() << "Unexpected version: " << VersionStr << ".\n"; + return false; + } + + /// readFunctionTag - If cursor points to a function tag then increment the + /// cursor and return true otherwise return false. + bool readFunctionTag() { + StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4); + if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\0' || + Tag[3] != '\1') { + return false; + } + Cursor += 4; + return true; + } + + /// readBlockTag - If cursor points to a block tag then increment the + /// cursor and return true otherwise return false. + bool readBlockTag() { + StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4); + if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\x41' || + Tag[3] != '\x01') { + return false; + } + Cursor += 4; + return true; + } + + /// readEdgeTag - If cursor points to an edge tag then increment the + /// cursor and return true otherwise return false. 
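Each reader above matches one 4-byte record tag a byte at a time; read as little-endian words, the byte patterns tested correspond to the standard gcov record tags. A reference sketch (constant names are this note's own, not part of the header):

  constexpr uint32_t FunctionTag = 0x01000000; // bytes 00 00 00 01
  constexpr uint32_t BlockTag    = 0x01410000; // bytes 00 00 41 01
  constexpr uint32_t EdgeTag     = 0x01430000; // bytes 00 00 43 01
  constexpr uint32_t LineTag     = 0x01450000; // bytes 00 00 45 01
  constexpr uint32_t ArcTag      = 0x01a10000; // bytes 00 00 a1 01
  constexpr uint32_t ObjectTag   = 0xa1000000; // bytes 00 00 00 a1
  constexpr uint32_t ProgramTag  = 0xa3000000; // bytes 00 00 00 a3

So readBlockTag() succeeds exactly when the next word's bytes are 00 00 41 01, and likewise for the readers that follow.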
+ bool readEdgeTag() { + StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4); + if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\x43' || + Tag[3] != '\x01') { + return false; + } + Cursor += 4; + return true; + } + + /// readLineTag - If cursor points to a line tag then increment the + /// cursor and return true otherwise return false. + bool readLineTag() { + StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4); + if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\x45' || + Tag[3] != '\x01') { + return false; + } + Cursor += 4; + return true; + } + + /// readArcTag - If cursor points to an gcda arc tag then increment the + /// cursor and return true otherwise return false. + bool readArcTag() { + StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4); + if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\xa1' || + Tag[3] != '\1') { + return false; + } + Cursor += 4; + return true; + } + + /// readObjectTag - If cursor points to an object summary tag then increment + /// the cursor and return true otherwise return false. + bool readObjectTag() { + StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4); + if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\0' || + Tag[3] != '\xa1') { + return false; + } + Cursor += 4; + return true; + } + + /// readProgramTag - If cursor points to a program summary tag then increment + /// the cursor and return true otherwise return false. + bool readProgramTag() { + StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4); + if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\0' || + Tag[3] != '\xa3') { + return false; + } + Cursor += 4; + return true; + } + + bool readInt(uint32_t &Val) { + if (Buffer->getBuffer().size() < Cursor + 4) { + errs() << "Unexpected end of memory buffer: " << Cursor + 4 << ".\n"; + return false; + } + StringRef Str = Buffer->getBuffer().slice(Cursor, Cursor + 4); + Cursor += 4; + Val = *(const uint32_t *)(Str.data()); + return true; + } + + bool readInt64(uint64_t &Val) { + uint32_t Lo, Hi; + if (!readInt(Lo) || !readInt(Hi)) + return false; + Val = ((uint64_t)Hi << 32) | Lo; + return true; + } + + bool readString(StringRef &Str) { + uint32_t Len = 0; + // Keep reading until we find a non-zero length. This emulates gcov's + // behaviour, which appears to do the same. + while (Len == 0) + if (!readInt(Len)) + return false; + Len *= 4; + if (Buffer->getBuffer().size() < Cursor + Len) { + errs() << "Unexpected end of memory buffer: " << Cursor + Len << ".\n"; + return false; + } + Str = Buffer->getBuffer().slice(Cursor, Cursor + Len).split('\0').first; + Cursor += Len; + return true; + } + + uint64_t getCursor() const { return Cursor; } + void advanceCursor(uint32_t n) { Cursor += n * 4; } + +private: + MemoryBuffer *Buffer; + uint64_t Cursor = 0; +}; + +/// GCOVFile - Collects coverage information for one pair of coverage file +/// (.gcno and .gcda). +class GCOVFile { +public: + GCOVFile() = default; + + bool readGCNO(GCOVBuffer &Buffer); + bool readGCDA(GCOVBuffer &Buffer); + uint32_t getChecksum() const { return Checksum; } + void print(raw_ostream &OS) const; + void dump() const; + void collectLineCounts(FileInfo &FI); + +private: + bool GCNOInitialized = false; + GCOV::GCOVVersion Version; + uint32_t Checksum = 0; + SmallVector, 16> Functions; + uint32_t RunCount = 0; + uint32_t ProgramCount = 0; +}; + +/// GCOVEdge - Collects edge information. 
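Everything in these buffers is a sequence of 4-byte words: readInt64() above assembles a counter from a low word followed by a high word, and readString() reads a length in words followed by that many bytes of NUL-padded text. A worked sketch of the layout (values illustrative, not from any real .gcno):

  uint32_t Words[] = {0x00000005, 0x00000000,  // readInt64 -> (hi << 32) | lo = 5
                      0x00000002,              // readString -> Len = 2 words = 8 bytes
                      0x6e69616d, 0x00000000}; // "main" + NUL padding, little-endian

readString() returns the slice up to the first '\0', i.e. "main", and leaves the cursor past the padding.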
+struct GCOVEdge { + GCOVEdge(GCOVBlock &S, GCOVBlock &D) : Src(S), Dst(D) {} + + GCOVBlock &Src; + GCOVBlock &Dst; + uint64_t Count = 0; +}; + +/// GCOVFunction - Collects function information. +class GCOVFunction { +public: + using BlockIterator = pointee_iterator>::const_iterator>; + + GCOVFunction(GCOVFile &P) : Parent(P) {} + + bool readGCNO(GCOVBuffer &Buffer, GCOV::GCOVVersion Version); + bool readGCDA(GCOVBuffer &Buffer, GCOV::GCOVVersion Version); + StringRef getName() const { return Name; } + StringRef getFilename() const { return Filename; } + size_t getNumBlocks() const { return Blocks.size(); } + uint64_t getEntryCount() const; + uint64_t getExitCount() const; + + BlockIterator block_begin() const { return Blocks.begin(); } + BlockIterator block_end() const { return Blocks.end(); } + iterator_range blocks() const { + return make_range(block_begin(), block_end()); + } + + void print(raw_ostream &OS) const; + void dump() const; + void collectLineCounts(FileInfo &FI); + +private: + GCOVFile &Parent; + uint32_t Ident = 0; + uint32_t Checksum; + uint32_t LineNumber = 0; + StringRef Name; + StringRef Filename; + SmallVector, 16> Blocks; + SmallVector, 16> Edges; +}; + +/// GCOVBlock - Collects block information. +class GCOVBlock { + struct EdgeWeight { + EdgeWeight(GCOVBlock *D) : Dst(D) {} + + GCOVBlock *Dst; + uint64_t Count = 0; + }; + + struct SortDstEdgesFunctor { + bool operator()(const GCOVEdge *E1, const GCOVEdge *E2) { + return E1->Dst.Number < E2->Dst.Number; + } + }; + +public: + using EdgeIterator = SmallVectorImpl::const_iterator; + + GCOVBlock(GCOVFunction &P, uint32_t N) : Parent(P), Number(N) {} + ~GCOVBlock(); + + const GCOVFunction &getParent() const { return Parent; } + void addLine(uint32_t N) { Lines.push_back(N); } + uint32_t getLastLine() const { return Lines.back(); } + void addCount(size_t DstEdgeNo, uint64_t N); + uint64_t getCount() const { return Counter; } + + void addSrcEdge(GCOVEdge *Edge) { + assert(&Edge->Dst == this); // up to caller to ensure edge is valid + SrcEdges.push_back(Edge); + } + + void addDstEdge(GCOVEdge *Edge) { + assert(&Edge->Src == this); // up to caller to ensure edge is valid + // Check if adding this edge causes list to become unsorted. + if (DstEdges.size() && DstEdges.back()->Dst.Number > Edge->Dst.Number) + DstEdgesAreSorted = false; + DstEdges.push_back(Edge); + } + + size_t getNumSrcEdges() const { return SrcEdges.size(); } + size_t getNumDstEdges() const { return DstEdges.size(); } + void sortDstEdges(); + + EdgeIterator src_begin() const { return SrcEdges.begin(); } + EdgeIterator src_end() const { return SrcEdges.end(); } + iterator_range srcs() const { + return make_range(src_begin(), src_end()); + } + + EdgeIterator dst_begin() const { return DstEdges.begin(); } + EdgeIterator dst_end() const { return DstEdges.end(); } + iterator_range dsts() const { + return make_range(dst_begin(), dst_end()); + } + + void print(raw_ostream &OS) const; + void dump() const; + void collectLineCounts(FileInfo &FI); + +private: + GCOVFunction &Parent; + uint32_t Number; + uint64_t Counter = 0; + bool DstEdgesAreSorted = true; + SmallVector SrcEdges; + SmallVector DstEdges; + SmallVector Lines; +}; + +class FileInfo { + // It is unlikely--but possible--for multiple functions to be on the same + // line. + // Therefore this typedef allows LineData.Functions to store multiple + // functions + // per instance. This is rare, however, so optimize for the common case. 
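A one-line illustration of the corner case the comment above allows for (example source, not from the patch):

  inline int f() { return 0; } inline int g() { return 1; }

Both functions start on the same line, so that line's LineData.Functions entry holds two pointers, while the SmallVector typedefs below keep the common single-function case cheap.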
+ using FunctionVector = SmallVector; + using FunctionLines = DenseMap; + using BlockVector = SmallVector; + using BlockLines = DenseMap; + + struct LineData { + LineData() = default; + + BlockLines Blocks; + FunctionLines Functions; + uint32_t LastLine = 0; + }; + + struct GCOVCoverage { + GCOVCoverage(StringRef Name) : Name(Name) {} + + StringRef Name; + + uint32_t LogicalLines = 0; + uint32_t LinesExec = 0; + + uint32_t Branches = 0; + uint32_t BranchesExec = 0; + uint32_t BranchesTaken = 0; + }; + +public: + FileInfo(const GCOV::Options &Options) : Options(Options) {} + + void addBlockLine(StringRef Filename, uint32_t Line, const GCOVBlock *Block) { + if (Line > LineInfo[Filename].LastLine) + LineInfo[Filename].LastLine = Line; + LineInfo[Filename].Blocks[Line - 1].push_back(Block); + } + + void addFunctionLine(StringRef Filename, uint32_t Line, + const GCOVFunction *Function) { + if (Line > LineInfo[Filename].LastLine) + LineInfo[Filename].LastLine = Line; + LineInfo[Filename].Functions[Line - 1].push_back(Function); + } + + void setRunCount(uint32_t Runs) { RunCount = Runs; } + void setProgramCount(uint32_t Programs) { ProgramCount = Programs; } + void print(raw_ostream &OS, StringRef MainFilename, StringRef GCNOFile, + StringRef GCDAFile); + +private: + std::string getCoveragePath(StringRef Filename, StringRef MainFilename); + std::unique_ptr openCoveragePath(StringRef CoveragePath); + void printFunctionSummary(raw_ostream &OS, const FunctionVector &Funcs) const; + void printBlockInfo(raw_ostream &OS, const GCOVBlock &Block, + uint32_t LineIndex, uint32_t &BlockNo) const; + void printBranchInfo(raw_ostream &OS, const GCOVBlock &Block, + GCOVCoverage &Coverage, uint32_t &EdgeNo); + void printUncondBranchInfo(raw_ostream &OS, uint32_t &EdgeNo, + uint64_t Count) const; + + void printCoverage(raw_ostream &OS, const GCOVCoverage &Coverage) const; + void printFuncCoverage(raw_ostream &OS) const; + void printFileCoverage(raw_ostream &OS) const; + + const GCOV::Options &Options; + StringMap LineInfo; + uint32_t RunCount = 0; + uint32_t ProgramCount = 0; + + using FileCoverageList = SmallVector, 4>; + using FuncCoverageMap = MapVector; + + FileCoverageList FileCoverages; + FuncCoverageMap FuncCoverages; +}; + +} // end namespace llvm + +#endif // LLVM_SUPPORT_GCOV_H diff --git a/include/llvm/ProfileData/SampleProfReader.h b/include/llvm/ProfileData/SampleProfReader.h index 9c1f357cbbd..0e9ab2dc60e 100644 --- a/include/llvm/ProfileData/SampleProfReader.h +++ b/include/llvm/ProfileData/SampleProfReader.h @@ -217,10 +217,10 @@ #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/ProfileSummary.h" +#include "llvm/ProfileData/GCOV.h" #include "llvm/ProfileData/SampleProf.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorOr.h" -#include "llvm/Support/GCOV.h" #include "llvm/Support/MemoryBuffer.h" #include #include diff --git a/include/llvm/Support/GCOV.h b/include/llvm/Support/GCOV.h deleted file mode 100644 index 02016e7dbd6..00000000000 --- a/include/llvm/Support/GCOV.h +++ /dev/null @@ -1,460 +0,0 @@ -//===- GCOV.h - LLVM coverage tool ------------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This header provides the interface to read and write coverage files that -// use 'gcov' format. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_SUPPORT_GCOV_H -#define LLVM_SUPPORT_GCOV_H - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/iterator.h" -#include "llvm/ADT/iterator_range.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/raw_ostream.h" -#include -#include -#include -#include -#include -#include - -namespace llvm { - -class GCOVFunction; -class GCOVBlock; -class FileInfo; - -namespace GCOV { - -enum GCOVVersion { V402, V404, V704 }; - -/// \brief A struct for passing gcov options between functions. -struct Options { - Options(bool A, bool B, bool C, bool F, bool P, bool U, bool L, bool N) - : AllBlocks(A), BranchInfo(B), BranchCount(C), FuncCoverage(F), - PreservePaths(P), UncondBranch(U), LongFileNames(L), NoOutput(N) {} - - bool AllBlocks; - bool BranchInfo; - bool BranchCount; - bool FuncCoverage; - bool PreservePaths; - bool UncondBranch; - bool LongFileNames; - bool NoOutput; -}; - -} // end namespace GCOV - -/// GCOVBuffer - A wrapper around MemoryBuffer to provide GCOV specific -/// read operations. -class GCOVBuffer { -public: - GCOVBuffer(MemoryBuffer *B) : Buffer(B) {} - - /// readGCNOFormat - Check GCNO signature is valid at the beginning of buffer. - bool readGCNOFormat() { - StringRef File = Buffer->getBuffer().slice(0, 4); - if (File != "oncg") { - errs() << "Unexpected file type: " << File << ".\n"; - return false; - } - Cursor = 4; - return true; - } - - /// readGCDAFormat - Check GCDA signature is valid at the beginning of buffer. - bool readGCDAFormat() { - StringRef File = Buffer->getBuffer().slice(0, 4); - if (File != "adcg") { - errs() << "Unexpected file type: " << File << ".\n"; - return false; - } - Cursor = 4; - return true; - } - - /// readGCOVVersion - Read GCOV version. - bool readGCOVVersion(GCOV::GCOVVersion &Version) { - StringRef VersionStr = Buffer->getBuffer().slice(Cursor, Cursor + 4); - if (VersionStr == "*204") { - Cursor += 4; - Version = GCOV::V402; - return true; - } - if (VersionStr == "*404") { - Cursor += 4; - Version = GCOV::V404; - return true; - } - if (VersionStr == "*704") { - Cursor += 4; - Version = GCOV::V704; - return true; - } - errs() << "Unexpected version: " << VersionStr << ".\n"; - return false; - } - - /// readFunctionTag - If cursor points to a function tag then increment the - /// cursor and return true otherwise return false. - bool readFunctionTag() { - StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4); - if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\0' || - Tag[3] != '\1') { - return false; - } - Cursor += 4; - return true; - } - - /// readBlockTag - If cursor points to a block tag then increment the - /// cursor and return true otherwise return false. - bool readBlockTag() { - StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4); - if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\x41' || - Tag[3] != '\x01') { - return false; - } - Cursor += 4; - return true; - } - - /// readEdgeTag - If cursor points to an edge tag then increment the - /// cursor and return true otherwise return false. 
- bool readEdgeTag() { - StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4); - if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\x43' || - Tag[3] != '\x01') { - return false; - } - Cursor += 4; - return true; - } - - /// readLineTag - If cursor points to a line tag then increment the - /// cursor and return true otherwise return false. - bool readLineTag() { - StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4); - if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\x45' || - Tag[3] != '\x01') { - return false; - } - Cursor += 4; - return true; - } - - /// readArcTag - If cursor points to an gcda arc tag then increment the - /// cursor and return true otherwise return false. - bool readArcTag() { - StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4); - if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\xa1' || - Tag[3] != '\1') { - return false; - } - Cursor += 4; - return true; - } - - /// readObjectTag - If cursor points to an object summary tag then increment - /// the cursor and return true otherwise return false. - bool readObjectTag() { - StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4); - if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\0' || - Tag[3] != '\xa1') { - return false; - } - Cursor += 4; - return true; - } - - /// readProgramTag - If cursor points to a program summary tag then increment - /// the cursor and return true otherwise return false. - bool readProgramTag() { - StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor + 4); - if (Tag.empty() || Tag[0] != '\0' || Tag[1] != '\0' || Tag[2] != '\0' || - Tag[3] != '\xa3') { - return false; - } - Cursor += 4; - return true; - } - - bool readInt(uint32_t &Val) { - if (Buffer->getBuffer().size() < Cursor + 4) { - errs() << "Unexpected end of memory buffer: " << Cursor + 4 << ".\n"; - return false; - } - StringRef Str = Buffer->getBuffer().slice(Cursor, Cursor + 4); - Cursor += 4; - Val = *(const uint32_t *)(Str.data()); - return true; - } - - bool readInt64(uint64_t &Val) { - uint32_t Lo, Hi; - if (!readInt(Lo) || !readInt(Hi)) - return false; - Val = ((uint64_t)Hi << 32) | Lo; - return true; - } - - bool readString(StringRef &Str) { - uint32_t Len = 0; - // Keep reading until we find a non-zero length. This emulates gcov's - // behaviour, which appears to do the same. - while (Len == 0) - if (!readInt(Len)) - return false; - Len *= 4; - if (Buffer->getBuffer().size() < Cursor + Len) { - errs() << "Unexpected end of memory buffer: " << Cursor + Len << ".\n"; - return false; - } - Str = Buffer->getBuffer().slice(Cursor, Cursor + Len).split('\0').first; - Cursor += Len; - return true; - } - - uint64_t getCursor() const { return Cursor; } - void advanceCursor(uint32_t n) { Cursor += n * 4; } - -private: - MemoryBuffer *Buffer; - uint64_t Cursor = 0; -}; - -/// GCOVFile - Collects coverage information for one pair of coverage file -/// (.gcno and .gcda). -class GCOVFile { -public: - GCOVFile() = default; - - bool readGCNO(GCOVBuffer &Buffer); - bool readGCDA(GCOVBuffer &Buffer); - uint32_t getChecksum() const { return Checksum; } - void print(raw_ostream &OS) const; - void dump() const; - void collectLineCounts(FileInfo &FI); - -private: - bool GCNOInitialized = false; - GCOV::GCOVVersion Version; - uint32_t Checksum = 0; - SmallVector, 16> Functions; - uint32_t RunCount = 0; - uint32_t ProgramCount = 0; -}; - -/// GCOVEdge - Collects edge information. 
-struct GCOVEdge { - GCOVEdge(GCOVBlock &S, GCOVBlock &D) : Src(S), Dst(D) {} - - GCOVBlock &Src; - GCOVBlock &Dst; - uint64_t Count = 0; -}; - -/// GCOVFunction - Collects function information. -class GCOVFunction { -public: - using BlockIterator = pointee_iterator>::const_iterator>; - - GCOVFunction(GCOVFile &P) : Parent(P) {} - - bool readGCNO(GCOVBuffer &Buffer, GCOV::GCOVVersion Version); - bool readGCDA(GCOVBuffer &Buffer, GCOV::GCOVVersion Version); - StringRef getName() const { return Name; } - StringRef getFilename() const { return Filename; } - size_t getNumBlocks() const { return Blocks.size(); } - uint64_t getEntryCount() const; - uint64_t getExitCount() const; - - BlockIterator block_begin() const { return Blocks.begin(); } - BlockIterator block_end() const { return Blocks.end(); } - iterator_range blocks() const { - return make_range(block_begin(), block_end()); - } - - void print(raw_ostream &OS) const; - void dump() const; - void collectLineCounts(FileInfo &FI); - -private: - GCOVFile &Parent; - uint32_t Ident = 0; - uint32_t Checksum; - uint32_t LineNumber = 0; - StringRef Name; - StringRef Filename; - SmallVector, 16> Blocks; - SmallVector, 16> Edges; -}; - -/// GCOVBlock - Collects block information. -class GCOVBlock { - struct EdgeWeight { - EdgeWeight(GCOVBlock *D) : Dst(D) {} - - GCOVBlock *Dst; - uint64_t Count = 0; - }; - - struct SortDstEdgesFunctor { - bool operator()(const GCOVEdge *E1, const GCOVEdge *E2) { - return E1->Dst.Number < E2->Dst.Number; - } - }; - -public: - using EdgeIterator = SmallVectorImpl::const_iterator; - - GCOVBlock(GCOVFunction &P, uint32_t N) : Parent(P), Number(N) {} - ~GCOVBlock(); - - const GCOVFunction &getParent() const { return Parent; } - void addLine(uint32_t N) { Lines.push_back(N); } - uint32_t getLastLine() const { return Lines.back(); } - void addCount(size_t DstEdgeNo, uint64_t N); - uint64_t getCount() const { return Counter; } - - void addSrcEdge(GCOVEdge *Edge) { - assert(&Edge->Dst == this); // up to caller to ensure edge is valid - SrcEdges.push_back(Edge); - } - - void addDstEdge(GCOVEdge *Edge) { - assert(&Edge->Src == this); // up to caller to ensure edge is valid - // Check if adding this edge causes list to become unsorted. - if (DstEdges.size() && DstEdges.back()->Dst.Number > Edge->Dst.Number) - DstEdgesAreSorted = false; - DstEdges.push_back(Edge); - } - - size_t getNumSrcEdges() const { return SrcEdges.size(); } - size_t getNumDstEdges() const { return DstEdges.size(); } - void sortDstEdges(); - - EdgeIterator src_begin() const { return SrcEdges.begin(); } - EdgeIterator src_end() const { return SrcEdges.end(); } - iterator_range srcs() const { - return make_range(src_begin(), src_end()); - } - - EdgeIterator dst_begin() const { return DstEdges.begin(); } - EdgeIterator dst_end() const { return DstEdges.end(); } - iterator_range dsts() const { - return make_range(dst_begin(), dst_end()); - } - - void print(raw_ostream &OS) const; - void dump() const; - void collectLineCounts(FileInfo &FI); - -private: - GCOVFunction &Parent; - uint32_t Number; - uint64_t Counter = 0; - bool DstEdgesAreSorted = true; - SmallVector SrcEdges; - SmallVector DstEdges; - SmallVector Lines; -}; - -class FileInfo { - // It is unlikely--but possible--for multiple functions to be on the same - // line. - // Therefore this typedef allows LineData.Functions to store multiple - // functions - // per instance. This is rare, however, so optimize for the common case. 
- using FunctionVector = SmallVector; - using FunctionLines = DenseMap; - using BlockVector = SmallVector; - using BlockLines = DenseMap; - - struct LineData { - LineData() = default; - - BlockLines Blocks; - FunctionLines Functions; - uint32_t LastLine = 0; - }; - - struct GCOVCoverage { - GCOVCoverage(StringRef Name) : Name(Name) {} - - StringRef Name; - - uint32_t LogicalLines = 0; - uint32_t LinesExec = 0; - - uint32_t Branches = 0; - uint32_t BranchesExec = 0; - uint32_t BranchesTaken = 0; - }; - -public: - FileInfo(const GCOV::Options &Options) : Options(Options) {} - - void addBlockLine(StringRef Filename, uint32_t Line, const GCOVBlock *Block) { - if (Line > LineInfo[Filename].LastLine) - LineInfo[Filename].LastLine = Line; - LineInfo[Filename].Blocks[Line - 1].push_back(Block); - } - - void addFunctionLine(StringRef Filename, uint32_t Line, - const GCOVFunction *Function) { - if (Line > LineInfo[Filename].LastLine) - LineInfo[Filename].LastLine = Line; - LineInfo[Filename].Functions[Line - 1].push_back(Function); - } - - void setRunCount(uint32_t Runs) { RunCount = Runs; } - void setProgramCount(uint32_t Programs) { ProgramCount = Programs; } - void print(raw_ostream &OS, StringRef MainFilename, StringRef GCNOFile, - StringRef GCDAFile); - -private: - std::string getCoveragePath(StringRef Filename, StringRef MainFilename); - std::unique_ptr openCoveragePath(StringRef CoveragePath); - void printFunctionSummary(raw_ostream &OS, const FunctionVector &Funcs) const; - void printBlockInfo(raw_ostream &OS, const GCOVBlock &Block, - uint32_t LineIndex, uint32_t &BlockNo) const; - void printBranchInfo(raw_ostream &OS, const GCOVBlock &Block, - GCOVCoverage &Coverage, uint32_t &EdgeNo); - void printUncondBranchInfo(raw_ostream &OS, uint32_t &EdgeNo, - uint64_t Count) const; - - void printCoverage(raw_ostream &OS, const GCOVCoverage &Coverage) const; - void printFuncCoverage(raw_ostream &OS) const; - void printFileCoverage(raw_ostream &OS) const; - - const GCOV::Options &Options; - StringMap LineInfo; - uint32_t RunCount = 0; - uint32_t ProgramCount = 0; - - using FileCoverageList = SmallVector, 4>; - using FuncCoverageMap = MapVector; - - FileCoverageList FileCoverages; - FuncCoverageMap FuncCoverages; -}; - -} // end namespace llvm - -#endif // LLVM_SUPPORT_GCOV_H diff --git a/lib/IR/CMakeLists.txt b/lib/IR/CMakeLists.txt index eb4b9143090..17822bbbb5c 100644 --- a/lib/IR/CMakeLists.txt +++ b/lib/IR/CMakeLists.txt @@ -22,7 +22,6 @@ add_llvm_library(LLVMCore DiagnosticPrinter.cpp Dominators.cpp Function.cpp - GCOV.cpp GVMaterializer.cpp Globals.cpp IRBuilder.cpp diff --git a/lib/IR/GCOV.cpp b/lib/IR/GCOV.cpp deleted file mode 100644 index d4b45522822..00000000000 --- a/lib/IR/GCOV.cpp +++ /dev/null @@ -1,821 +0,0 @@ -//===- GCOV.cpp - LLVM coverage tool --------------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// GCOV implements the interface to read and write coverage files that use -// 'gcov' format. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/GCOV.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/raw_ostream.h" -#include -#include - -using namespace llvm; - -//===----------------------------------------------------------------------===// -// GCOVFile implementation. - -/// readGCNO - Read GCNO buffer. -bool GCOVFile::readGCNO(GCOVBuffer &Buffer) { - if (!Buffer.readGCNOFormat()) - return false; - if (!Buffer.readGCOVVersion(Version)) - return false; - - if (!Buffer.readInt(Checksum)) - return false; - while (true) { - if (!Buffer.readFunctionTag()) - break; - auto GFun = make_unique(*this); - if (!GFun->readGCNO(Buffer, Version)) - return false; - Functions.push_back(std::move(GFun)); - } - - GCNOInitialized = true; - return true; -} - -/// readGCDA - Read GCDA buffer. It is required that readGCDA() can only be -/// called after readGCNO(). -bool GCOVFile::readGCDA(GCOVBuffer &Buffer) { - assert(GCNOInitialized && "readGCDA() can only be called after readGCNO()"); - if (!Buffer.readGCDAFormat()) - return false; - GCOV::GCOVVersion GCDAVersion; - if (!Buffer.readGCOVVersion(GCDAVersion)) - return false; - if (Version != GCDAVersion) { - errs() << "GCOV versions do not match.\n"; - return false; - } - - uint32_t GCDAChecksum; - if (!Buffer.readInt(GCDAChecksum)) - return false; - if (Checksum != GCDAChecksum) { - errs() << "File checksums do not match: " << Checksum - << " != " << GCDAChecksum << ".\n"; - return false; - } - for (size_t i = 0, e = Functions.size(); i < e; ++i) { - if (!Buffer.readFunctionTag()) { - errs() << "Unexpected number of functions.\n"; - return false; - } - if (!Functions[i]->readGCDA(Buffer, Version)) - return false; - } - if (Buffer.readObjectTag()) { - uint32_t Length; - uint32_t Dummy; - if (!Buffer.readInt(Length)) - return false; - if (!Buffer.readInt(Dummy)) - return false; // checksum - if (!Buffer.readInt(Dummy)) - return false; // num - if (!Buffer.readInt(RunCount)) - return false; - Buffer.advanceCursor(Length - 3); - } - while (Buffer.readProgramTag()) { - uint32_t Length; - if (!Buffer.readInt(Length)) - return false; - Buffer.advanceCursor(Length); - ++ProgramCount; - } - - return true; -} - -void GCOVFile::print(raw_ostream &OS) const { - for (const auto &FPtr : Functions) - FPtr->print(OS); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -/// dump - Dump GCOVFile content to dbgs() for debugging purposes. -LLVM_DUMP_METHOD void GCOVFile::dump() const { - print(dbgs()); -} -#endif - -/// collectLineCounts - Collect line counts. This must be used after -/// reading .gcno and .gcda files. -void GCOVFile::collectLineCounts(FileInfo &FI) { - for (const auto &FPtr : Functions) - FPtr->collectLineCounts(FI); - FI.setRunCount(RunCount); - FI.setProgramCount(ProgramCount); -} - -//===----------------------------------------------------------------------===// -// GCOVFunction implementation. - -/// readGCNO - Read a function from the GCNO buffer. Return false if an error -/// occurs. 
-bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) { - uint32_t Dummy; - if (!Buff.readInt(Dummy)) - return false; // Function header length - if (!Buff.readInt(Ident)) - return false; - if (!Buff.readInt(Checksum)) - return false; - if (Version != GCOV::V402) { - uint32_t CfgChecksum; - if (!Buff.readInt(CfgChecksum)) - return false; - if (Parent.getChecksum() != CfgChecksum) { - errs() << "File checksums do not match: " << Parent.getChecksum() - << " != " << CfgChecksum << " in (" << Name << ").\n"; - return false; - } - } - if (!Buff.readString(Name)) - return false; - if (!Buff.readString(Filename)) - return false; - if (!Buff.readInt(LineNumber)) - return false; - - // read blocks. - if (!Buff.readBlockTag()) { - errs() << "Block tag not found.\n"; - return false; - } - uint32_t BlockCount; - if (!Buff.readInt(BlockCount)) - return false; - for (uint32_t i = 0, e = BlockCount; i != e; ++i) { - if (!Buff.readInt(Dummy)) - return false; // Block flags; - Blocks.push_back(make_unique(*this, i)); - } - - // read edges. - while (Buff.readEdgeTag()) { - uint32_t EdgeCount; - if (!Buff.readInt(EdgeCount)) - return false; - EdgeCount = (EdgeCount - 1) / 2; - uint32_t BlockNo; - if (!Buff.readInt(BlockNo)) - return false; - if (BlockNo >= BlockCount) { - errs() << "Unexpected block number: " << BlockNo << " (in " << Name - << ").\n"; - return false; - } - for (uint32_t i = 0, e = EdgeCount; i != e; ++i) { - uint32_t Dst; - if (!Buff.readInt(Dst)) - return false; - Edges.push_back(make_unique(*Blocks[BlockNo], *Blocks[Dst])); - GCOVEdge *Edge = Edges.back().get(); - Blocks[BlockNo]->addDstEdge(Edge); - Blocks[Dst]->addSrcEdge(Edge); - if (!Buff.readInt(Dummy)) - return false; // Edge flag - } - } - - // read line table. - while (Buff.readLineTag()) { - uint32_t LineTableLength; - // Read the length of this line table. - if (!Buff.readInt(LineTableLength)) - return false; - uint32_t EndPos = Buff.getCursor() + LineTableLength * 4; - uint32_t BlockNo; - // Read the block number this table is associated with. - if (!Buff.readInt(BlockNo)) - return false; - if (BlockNo >= BlockCount) { - errs() << "Unexpected block number: " << BlockNo << " (in " << Name - << ").\n"; - return false; - } - GCOVBlock &Block = *Blocks[BlockNo]; - // Read the word that pads the beginning of the line table. This may be a - // flag of some sort, but seems to always be zero. - if (!Buff.readInt(Dummy)) - return false; - - // Line information starts here and continues up until the last word. - if (Buff.getCursor() != (EndPos - sizeof(uint32_t))) { - StringRef F; - // Read the source file name. - if (!Buff.readString(F)) - return false; - if (Filename != F) { - errs() << "Multiple sources for a single basic block: " << Filename - << " != " << F << " (in " << Name << ").\n"; - return false; - } - // Read lines up to, but not including, the null terminator. - while (Buff.getCursor() < (EndPos - 2 * sizeof(uint32_t))) { - uint32_t Line; - if (!Buff.readInt(Line)) - return false; - // Line 0 means this instruction was injected by the compiler. Skip it. - if (!Line) - continue; - Block.addLine(Line); - } - // Read the null terminator. - if (!Buff.readInt(Dummy)) - return false; - } - // The last word is either a flag or padding, it isn't clear which. Skip - // over it. - if (!Buff.readInt(Dummy)) - return false; - } - return true; -} - -/// readGCDA - Read a function from the GCDA buffer. Return false if an error -/// occurs. 
-bool GCOVFunction::readGCDA(GCOVBuffer &Buff, GCOV::GCOVVersion Version) { - uint32_t HeaderLength; - if (!Buff.readInt(HeaderLength)) - return false; // Function header length - - uint64_t EndPos = Buff.getCursor() + HeaderLength * sizeof(uint32_t); - - uint32_t GCDAIdent; - if (!Buff.readInt(GCDAIdent)) - return false; - if (Ident != GCDAIdent) { - errs() << "Function identifiers do not match: " << Ident - << " != " << GCDAIdent << " (in " << Name << ").\n"; - return false; - } - - uint32_t GCDAChecksum; - if (!Buff.readInt(GCDAChecksum)) - return false; - if (Checksum != GCDAChecksum) { - errs() << "Function checksums do not match: " << Checksum - << " != " << GCDAChecksum << " (in " << Name << ").\n"; - return false; - } - - uint32_t CfgChecksum; - if (Version != GCOV::V402) { - if (!Buff.readInt(CfgChecksum)) - return false; - if (Parent.getChecksum() != CfgChecksum) { - errs() << "File checksums do not match: " << Parent.getChecksum() - << " != " << CfgChecksum << " (in " << Name << ").\n"; - return false; - } - } - - if (Buff.getCursor() < EndPos) { - StringRef GCDAName; - if (!Buff.readString(GCDAName)) - return false; - if (Name != GCDAName) { - errs() << "Function names do not match: " << Name << " != " << GCDAName - << ".\n"; - return false; - } - } - - if (!Buff.readArcTag()) { - errs() << "Arc tag not found (in " << Name << ").\n"; - return false; - } - - uint32_t Count; - if (!Buff.readInt(Count)) - return false; - Count /= 2; - - // This for loop adds the counts for each block. A second nested loop is - // required to combine the edge counts that are contained in the GCDA file. - for (uint32_t BlockNo = 0; Count > 0; ++BlockNo) { - // The last block is always reserved for exit block - if (BlockNo >= Blocks.size()) { - errs() << "Unexpected number of edges (in " << Name << ").\n"; - return false; - } - if (BlockNo == Blocks.size() - 1) - errs() << "(" << Name << ") has arcs from exit block.\n"; - GCOVBlock &Block = *Blocks[BlockNo]; - for (size_t EdgeNo = 0, End = Block.getNumDstEdges(); EdgeNo < End; - ++EdgeNo) { - if (Count == 0) { - errs() << "Unexpected number of edges (in " << Name << ").\n"; - return false; - } - uint64_t ArcCount; - if (!Buff.readInt64(ArcCount)) - return false; - Block.addCount(EdgeNo, ArcCount); - --Count; - } - Block.sortDstEdges(); - } - return true; -} - -/// getEntryCount - Get the number of times the function was called by -/// retrieving the entry block's count. -uint64_t GCOVFunction::getEntryCount() const { - return Blocks.front()->getCount(); -} - -/// getExitCount - Get the number of times the function returned by retrieving -/// the exit block's count. -uint64_t GCOVFunction::getExitCount() const { - return Blocks.back()->getCount(); -} - -void GCOVFunction::print(raw_ostream &OS) const { - OS << "===== " << Name << " (" << Ident << ") @ " << Filename << ":" - << LineNumber << "\n"; - for (const auto &Block : Blocks) - Block->print(OS); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -/// dump - Dump GCOVFunction content to dbgs() for debugging purposes. -LLVM_DUMP_METHOD void GCOVFunction::dump() const { - print(dbgs()); -} -#endif - -/// collectLineCounts - Collect line counts. This must be used after -/// reading .gcno and .gcda files. -void GCOVFunction::collectLineCounts(FileInfo &FI) { - // If the line number is zero, this is a function that doesn't actually appear - // in the source file, so there isn't anything we can do with it. 
- if (LineNumber == 0) - return; - - for (const auto &Block : Blocks) - Block->collectLineCounts(FI); - FI.addFunctionLine(Filename, LineNumber, this); -} - -//===----------------------------------------------------------------------===// -// GCOVBlock implementation. - -/// ~GCOVBlock - Delete GCOVBlock and its content. -GCOVBlock::~GCOVBlock() { - SrcEdges.clear(); - DstEdges.clear(); - Lines.clear(); -} - -/// addCount - Add to block counter while storing the edge count. If the -/// destination has no outgoing edges, also update that block's count too. -void GCOVBlock::addCount(size_t DstEdgeNo, uint64_t N) { - assert(DstEdgeNo < DstEdges.size()); // up to caller to ensure EdgeNo is valid - DstEdges[DstEdgeNo]->Count = N; - Counter += N; - if (!DstEdges[DstEdgeNo]->Dst.getNumDstEdges()) - DstEdges[DstEdgeNo]->Dst.Counter += N; -} - -/// sortDstEdges - Sort destination edges by block number, nop if already -/// sorted. This is required for printing branch info in the correct order. -void GCOVBlock::sortDstEdges() { - if (!DstEdgesAreSorted) { - SortDstEdgesFunctor SortEdges; - std::stable_sort(DstEdges.begin(), DstEdges.end(), SortEdges); - } -} - -/// collectLineCounts - Collect line counts. This must be used after -/// reading .gcno and .gcda files. -void GCOVBlock::collectLineCounts(FileInfo &FI) { - for (uint32_t N : Lines) - FI.addBlockLine(Parent.getFilename(), N, this); -} - -void GCOVBlock::print(raw_ostream &OS) const { - OS << "Block : " << Number << " Counter : " << Counter << "\n"; - if (!SrcEdges.empty()) { - OS << "\tSource Edges : "; - for (const GCOVEdge *Edge : SrcEdges) - OS << Edge->Src.Number << " (" << Edge->Count << "), "; - OS << "\n"; - } - if (!DstEdges.empty()) { - OS << "\tDestination Edges : "; - for (const GCOVEdge *Edge : DstEdges) - OS << Edge->Dst.Number << " (" << Edge->Count << "), "; - OS << "\n"; - } - if (!Lines.empty()) { - OS << "\tLines : "; - for (uint32_t N : Lines) - OS << (N) << ","; - OS << "\n"; - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -/// dump - Dump GCOVBlock content to dbgs() for debugging purposes. -LLVM_DUMP_METHOD void GCOVBlock::dump() const { - print(dbgs()); -} -#endif - -//===----------------------------------------------------------------------===// -// FileInfo implementation. - -// Safe integer division, returns 0 if numerator is 0. 
-static uint32_t safeDiv(uint64_t Numerator, uint64_t Divisor) { - if (!Numerator) - return 0; - return Numerator / Divisor; -} - -// This custom division function mimics gcov's branch ouputs: -// - Round to closest whole number -// - Only output 0% or 100% if it's exactly that value -static uint32_t branchDiv(uint64_t Numerator, uint64_t Divisor) { - if (!Numerator) - return 0; - if (Numerator == Divisor) - return 100; - - uint8_t Res = (Numerator * 100 + Divisor / 2) / Divisor; - if (Res == 0) - return 1; - if (Res == 100) - return 99; - return Res; -} - -namespace { -struct formatBranchInfo { - formatBranchInfo(const GCOV::Options &Options, uint64_t Count, uint64_t Total) - : Options(Options), Count(Count), Total(Total) {} - - void print(raw_ostream &OS) const { - if (!Total) - OS << "never executed"; - else if (Options.BranchCount) - OS << "taken " << Count; - else - OS << "taken " << branchDiv(Count, Total) << "%"; - } - - const GCOV::Options &Options; - uint64_t Count; - uint64_t Total; -}; - -static raw_ostream &operator<<(raw_ostream &OS, const formatBranchInfo &FBI) { - FBI.print(OS); - return OS; -} - -class LineConsumer { - std::unique_ptr Buffer; - StringRef Remaining; - -public: - LineConsumer(StringRef Filename) { - ErrorOr> BufferOrErr = - MemoryBuffer::getFileOrSTDIN(Filename); - if (std::error_code EC = BufferOrErr.getError()) { - errs() << Filename << ": " << EC.message() << "\n"; - Remaining = ""; - } else { - Buffer = std::move(BufferOrErr.get()); - Remaining = Buffer->getBuffer(); - } - } - bool empty() { return Remaining.empty(); } - void printNext(raw_ostream &OS, uint32_t LineNum) { - StringRef Line; - if (empty()) - Line = "/*EOF*/"; - else - std::tie(Line, Remaining) = Remaining.split("\n"); - OS << format("%5u:", LineNum) << Line << "\n"; - } -}; -} // end anonymous namespace - -/// Convert a path to a gcov filename. If PreservePaths is true, this -/// translates "/" to "#", ".." to "^", and drops ".", to match gcov. -static std::string mangleCoveragePath(StringRef Filename, bool PreservePaths) { - if (!PreservePaths) - return sys::path::filename(Filename).str(); - - // This behaviour is defined by gcov in terms of text replacements, so it's - // not likely to do anything useful on filesystems with different textual - // conventions. - llvm::SmallString<256> Result(""); - StringRef::iterator I, S, E; - for (I = S = Filename.begin(), E = Filename.end(); I != E; ++I) { - if (*I != '/') - continue; - - if (I - S == 1 && *S == '.') { - // ".", the current directory, is skipped. - } else if (I - S == 2 && *S == '.' && *(S + 1) == '.') { - // "..", the parent directory, is replaced with "^". - Result.append("^#"); - } else { - if (S < I) - // Leave other components intact, - Result.append(S, I); - // And separate with "#". - Result.push_back('#'); - } - S = I + 1; - } - - if (S < I) - Result.append(S, I); - return Result.str(); -} - -std::string FileInfo::getCoveragePath(StringRef Filename, - StringRef MainFilename) { - if (Options.NoOutput) - // This is probably a bug in gcov, but when -n is specified, paths aren't - // mangled at all, and the -l and -p options are ignored. Here, we do the - // same. 
- return Filename; - - std::string CoveragePath; - if (Options.LongFileNames && !Filename.equals(MainFilename)) - CoveragePath = - mangleCoveragePath(MainFilename, Options.PreservePaths) + "##"; - CoveragePath += mangleCoveragePath(Filename, Options.PreservePaths) + ".gcov"; - return CoveragePath; -} - -std::unique_ptr -FileInfo::openCoveragePath(StringRef CoveragePath) { - if (Options.NoOutput) - return llvm::make_unique(); - - std::error_code EC; - auto OS = llvm::make_unique(CoveragePath, EC, - sys::fs::F_Text); - if (EC) { - errs() << EC.message() << "\n"; - return llvm::make_unique(); - } - return std::move(OS); -} - -/// print - Print source files with collected line count information. -void FileInfo::print(raw_ostream &InfoOS, StringRef MainFilename, - StringRef GCNOFile, StringRef GCDAFile) { - SmallVector Filenames; - for (const auto &LI : LineInfo) - Filenames.push_back(LI.first()); - std::sort(Filenames.begin(), Filenames.end()); - - for (StringRef Filename : Filenames) { - auto AllLines = LineConsumer(Filename); - - std::string CoveragePath = getCoveragePath(Filename, MainFilename); - std::unique_ptr CovStream = openCoveragePath(CoveragePath); - raw_ostream &CovOS = *CovStream; - - CovOS << " -: 0:Source:" << Filename << "\n"; - CovOS << " -: 0:Graph:" << GCNOFile << "\n"; - CovOS << " -: 0:Data:" << GCDAFile << "\n"; - CovOS << " -: 0:Runs:" << RunCount << "\n"; - CovOS << " -: 0:Programs:" << ProgramCount << "\n"; - - const LineData &Line = LineInfo[Filename]; - GCOVCoverage FileCoverage(Filename); - for (uint32_t LineIndex = 0; LineIndex < Line.LastLine || !AllLines.empty(); - ++LineIndex) { - if (Options.BranchInfo) { - FunctionLines::const_iterator FuncsIt = Line.Functions.find(LineIndex); - if (FuncsIt != Line.Functions.end()) - printFunctionSummary(CovOS, FuncsIt->second); - } - - BlockLines::const_iterator BlocksIt = Line.Blocks.find(LineIndex); - if (BlocksIt == Line.Blocks.end()) { - // No basic blocks are on this line. Not an executable line of code. - CovOS << " -:"; - AllLines.printNext(CovOS, LineIndex + 1); - } else { - const BlockVector &Blocks = BlocksIt->second; - - // Add up the block counts to form line counts. - DenseMap LineExecs; - uint64_t LineCount = 0; - for (const GCOVBlock *Block : Blocks) { - if (Options.AllBlocks) { - // Only take the highest block count for that line. - uint64_t BlockCount = Block->getCount(); - LineCount = LineCount > BlockCount ? LineCount : BlockCount; - } else { - // Sum up all of the block counts. - LineCount += Block->getCount(); - } - - if (Options.FuncCoverage) { - // This is a slightly convoluted way to most accurately gather line - // statistics for functions. Basically what is happening is that we - // don't want to count a single line with multiple blocks more than - // once. However, we also don't simply want to give the total line - // count to every function that starts on the line. Thus, what is - // happening here are two things: - // 1) Ensure that the number of logical lines is only incremented - // once per function. - // 2) If there are multiple blocks on the same line, ensure that the - // number of lines executed is incremented as long as at least - // one of the blocks are executed. 
- const GCOVFunction *Function = &Block->getParent(); - if (FuncCoverages.find(Function) == FuncCoverages.end()) { - std::pair KeyValue( - Function, GCOVCoverage(Function->getName())); - FuncCoverages.insert(KeyValue); - } - GCOVCoverage &FuncCoverage = FuncCoverages.find(Function)->second; - - if (LineExecs.find(Function) == LineExecs.end()) { - if (Block->getCount()) { - ++FuncCoverage.LinesExec; - LineExecs[Function] = true; - } else { - LineExecs[Function] = false; - } - ++FuncCoverage.LogicalLines; - } else if (!LineExecs[Function] && Block->getCount()) { - ++FuncCoverage.LinesExec; - LineExecs[Function] = true; - } - } - } - - if (LineCount == 0) - CovOS << " #####:"; - else { - CovOS << format("%9" PRIu64 ":", LineCount); - ++FileCoverage.LinesExec; - } - ++FileCoverage.LogicalLines; - - AllLines.printNext(CovOS, LineIndex + 1); - - uint32_t BlockNo = 0; - uint32_t EdgeNo = 0; - for (const GCOVBlock *Block : Blocks) { - // Only print block and branch information at the end of the block. - if (Block->getLastLine() != LineIndex + 1) - continue; - if (Options.AllBlocks) - printBlockInfo(CovOS, *Block, LineIndex, BlockNo); - if (Options.BranchInfo) { - size_t NumEdges = Block->getNumDstEdges(); - if (NumEdges > 1) - printBranchInfo(CovOS, *Block, FileCoverage, EdgeNo); - else if (Options.UncondBranch && NumEdges == 1) - printUncondBranchInfo(CovOS, EdgeNo, - (*Block->dst_begin())->Count); - } - } - } - } - FileCoverages.push_back(std::make_pair(CoveragePath, FileCoverage)); - } - - // FIXME: There is no way to detect calls given current instrumentation. - if (Options.FuncCoverage) - printFuncCoverage(InfoOS); - printFileCoverage(InfoOS); -} - -/// printFunctionSummary - Print function and block summary. -void FileInfo::printFunctionSummary(raw_ostream &OS, - const FunctionVector &Funcs) const { - for (const GCOVFunction *Func : Funcs) { - uint64_t EntryCount = Func->getEntryCount(); - uint32_t BlocksExec = 0; - for (const GCOVBlock &Block : Func->blocks()) - if (Block.getNumDstEdges() && Block.getCount()) - ++BlocksExec; - - OS << "function " << Func->getName() << " called " << EntryCount - << " returned " << safeDiv(Func->getExitCount() * 100, EntryCount) - << "% blocks executed " - << safeDiv(BlocksExec * 100, Func->getNumBlocks() - 1) << "%\n"; - } -} - -/// printBlockInfo - Output counts for each block. -void FileInfo::printBlockInfo(raw_ostream &OS, const GCOVBlock &Block, - uint32_t LineIndex, uint32_t &BlockNo) const { - if (Block.getCount() == 0) - OS << " $$$$$:"; - else - OS << format("%9" PRIu64 ":", Block.getCount()); - OS << format("%5u-block %2u\n", LineIndex + 1, BlockNo++); -} - -/// printBranchInfo - Print conditional branch probabilities. 
-void FileInfo::printBranchInfo(raw_ostream &OS, const GCOVBlock &Block, - GCOVCoverage &Coverage, uint32_t &EdgeNo) { - SmallVector BranchCounts; - uint64_t TotalCounts = 0; - for (const GCOVEdge *Edge : Block.dsts()) { - BranchCounts.push_back(Edge->Count); - TotalCounts += Edge->Count; - if (Block.getCount()) - ++Coverage.BranchesExec; - if (Edge->Count) - ++Coverage.BranchesTaken; - ++Coverage.Branches; - - if (Options.FuncCoverage) { - const GCOVFunction *Function = &Block.getParent(); - GCOVCoverage &FuncCoverage = FuncCoverages.find(Function)->second; - if (Block.getCount()) - ++FuncCoverage.BranchesExec; - if (Edge->Count) - ++FuncCoverage.BranchesTaken; - ++FuncCoverage.Branches; - } - } - - for (uint64_t N : BranchCounts) - OS << format("branch %2u ", EdgeNo++) - << formatBranchInfo(Options, N, TotalCounts) << "\n"; -} - -/// printUncondBranchInfo - Print unconditional branch probabilities. -void FileInfo::printUncondBranchInfo(raw_ostream &OS, uint32_t &EdgeNo, - uint64_t Count) const { - OS << format("unconditional %2u ", EdgeNo++) - << formatBranchInfo(Options, Count, Count) << "\n"; -} - -// printCoverage - Print generic coverage info used by both printFuncCoverage -// and printFileCoverage. -void FileInfo::printCoverage(raw_ostream &OS, - const GCOVCoverage &Coverage) const { - OS << format("Lines executed:%.2f%% of %u\n", - double(Coverage.LinesExec) * 100 / Coverage.LogicalLines, - Coverage.LogicalLines); - if (Options.BranchInfo) { - if (Coverage.Branches) { - OS << format("Branches executed:%.2f%% of %u\n", - double(Coverage.BranchesExec) * 100 / Coverage.Branches, - Coverage.Branches); - OS << format("Taken at least once:%.2f%% of %u\n", - double(Coverage.BranchesTaken) * 100 / Coverage.Branches, - Coverage.Branches); - } else { - OS << "No branches\n"; - } - OS << "No calls\n"; // to be consistent with gcov - } -} - -// printFuncCoverage - Print per-function coverage info. -void FileInfo::printFuncCoverage(raw_ostream &OS) const { - for (const auto &FC : FuncCoverages) { - const GCOVCoverage &Coverage = FC.second; - OS << "Function '" << Coverage.Name << "'\n"; - printCoverage(OS, Coverage); - OS << "\n"; - } -} - -// printFileCoverage - Print per-file coverage info. -void FileInfo::printFileCoverage(raw_ostream &OS) const { - for (const auto &FC : FileCoverages) { - const std::string &Filename = FC.first; - const GCOVCoverage &Coverage = FC.second; - OS << "File '" << Coverage.Name << "'\n"; - printCoverage(OS, Coverage); - if (!Options.NoOutput) - OS << Coverage.Name << ":creating '" << Filename << "'\n"; - OS << "\n"; - } -} diff --git a/lib/ProfileData/CMakeLists.txt b/lib/ProfileData/CMakeLists.txt index cd65762ae6a..3a981d8acf4 100644 --- a/lib/ProfileData/CMakeLists.txt +++ b/lib/ProfileData/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_library(LLVMProfileData + GCOV.cpp InstrProf.cpp InstrProfReader.cpp InstrProfWriter.cpp diff --git a/lib/ProfileData/GCOV.cpp b/lib/ProfileData/GCOV.cpp new file mode 100644 index 00000000000..d6e44389f2b --- /dev/null +++ b/lib/ProfileData/GCOV.cpp @@ -0,0 +1,821 @@ +//===- GCOV.cpp - LLVM coverage tool --------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// GCOV implements the interface to read and write coverage files that use +// 'gcov' format. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ProfileData/GCOV.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// GCOVFile implementation. + +/// readGCNO - Read GCNO buffer. +bool GCOVFile::readGCNO(GCOVBuffer &Buffer) { + if (!Buffer.readGCNOFormat()) + return false; + if (!Buffer.readGCOVVersion(Version)) + return false; + + if (!Buffer.readInt(Checksum)) + return false; + while (true) { + if (!Buffer.readFunctionTag()) + break; + auto GFun = make_unique(*this); + if (!GFun->readGCNO(Buffer, Version)) + return false; + Functions.push_back(std::move(GFun)); + } + + GCNOInitialized = true; + return true; +} + +/// readGCDA - Read GCDA buffer. It is required that readGCDA() can only be +/// called after readGCNO(). +bool GCOVFile::readGCDA(GCOVBuffer &Buffer) { + assert(GCNOInitialized && "readGCDA() can only be called after readGCNO()"); + if (!Buffer.readGCDAFormat()) + return false; + GCOV::GCOVVersion GCDAVersion; + if (!Buffer.readGCOVVersion(GCDAVersion)) + return false; + if (Version != GCDAVersion) { + errs() << "GCOV versions do not match.\n"; + return false; + } + + uint32_t GCDAChecksum; + if (!Buffer.readInt(GCDAChecksum)) + return false; + if (Checksum != GCDAChecksum) { + errs() << "File checksums do not match: " << Checksum + << " != " << GCDAChecksum << ".\n"; + return false; + } + for (size_t i = 0, e = Functions.size(); i < e; ++i) { + if (!Buffer.readFunctionTag()) { + errs() << "Unexpected number of functions.\n"; + return false; + } + if (!Functions[i]->readGCDA(Buffer, Version)) + return false; + } + if (Buffer.readObjectTag()) { + uint32_t Length; + uint32_t Dummy; + if (!Buffer.readInt(Length)) + return false; + if (!Buffer.readInt(Dummy)) + return false; // checksum + if (!Buffer.readInt(Dummy)) + return false; // num + if (!Buffer.readInt(RunCount)) + return false; + Buffer.advanceCursor(Length - 3); + } + while (Buffer.readProgramTag()) { + uint32_t Length; + if (!Buffer.readInt(Length)) + return false; + Buffer.advanceCursor(Length); + ++ProgramCount; + } + + return true; +} + +void GCOVFile::print(raw_ostream &OS) const { + for (const auto &FPtr : Functions) + FPtr->print(OS); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +/// dump - Dump GCOVFile content to dbgs() for debugging purposes. +LLVM_DUMP_METHOD void GCOVFile::dump() const { + print(dbgs()); +} +#endif + +/// collectLineCounts - Collect line counts. This must be used after +/// reading .gcno and .gcda files. +void GCOVFile::collectLineCounts(FileInfo &FI) { + for (const auto &FPtr : Functions) + FPtr->collectLineCounts(FI); + FI.setRunCount(RunCount); + FI.setProgramCount(ProgramCount); +} + +//===----------------------------------------------------------------------===// +// GCOVFunction implementation. + +/// readGCNO - Read a function from the GCNO buffer. Return false if an error +/// occurs. 
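For reference, the summary records that readGCDA() above walks have a simple shape: a length word, then that many payload words, of which only the run count is kept. A hedged sketch (field names are descriptive, not from the source):

  // After readObjectTag() succeeds:
  //   word 0                    : Length (payload size in words)
  //   payload word 0            : checksum (read into Dummy)
  //   payload word 1            : num      (read into Dummy)
  //   payload word 2            : runs     (read into RunCount)
  //   payload words 3..Length-1 : skipped via advanceCursor(Length - 3)

Program-summary records are skipped whole; only their number is tallied in ProgramCount.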
+bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) { + uint32_t Dummy; + if (!Buff.readInt(Dummy)) + return false; // Function header length + if (!Buff.readInt(Ident)) + return false; + if (!Buff.readInt(Checksum)) + return false; + if (Version != GCOV::V402) { + uint32_t CfgChecksum; + if (!Buff.readInt(CfgChecksum)) + return false; + if (Parent.getChecksum() != CfgChecksum) { + errs() << "File checksums do not match: " << Parent.getChecksum() + << " != " << CfgChecksum << " in (" << Name << ").\n"; + return false; + } + } + if (!Buff.readString(Name)) + return false; + if (!Buff.readString(Filename)) + return false; + if (!Buff.readInt(LineNumber)) + return false; + + // read blocks. + if (!Buff.readBlockTag()) { + errs() << "Block tag not found.\n"; + return false; + } + uint32_t BlockCount; + if (!Buff.readInt(BlockCount)) + return false; + for (uint32_t i = 0, e = BlockCount; i != e; ++i) { + if (!Buff.readInt(Dummy)) + return false; // Block flags; + Blocks.push_back(make_unique(*this, i)); + } + + // read edges. + while (Buff.readEdgeTag()) { + uint32_t EdgeCount; + if (!Buff.readInt(EdgeCount)) + return false; + EdgeCount = (EdgeCount - 1) / 2; + uint32_t BlockNo; + if (!Buff.readInt(BlockNo)) + return false; + if (BlockNo >= BlockCount) { + errs() << "Unexpected block number: " << BlockNo << " (in " << Name + << ").\n"; + return false; + } + for (uint32_t i = 0, e = EdgeCount; i != e; ++i) { + uint32_t Dst; + if (!Buff.readInt(Dst)) + return false; + Edges.push_back(make_unique(*Blocks[BlockNo], *Blocks[Dst])); + GCOVEdge *Edge = Edges.back().get(); + Blocks[BlockNo]->addDstEdge(Edge); + Blocks[Dst]->addSrcEdge(Edge); + if (!Buff.readInt(Dummy)) + return false; // Edge flag + } + } + + // read line table. + while (Buff.readLineTag()) { + uint32_t LineTableLength; + // Read the length of this line table. + if (!Buff.readInt(LineTableLength)) + return false; + uint32_t EndPos = Buff.getCursor() + LineTableLength * 4; + uint32_t BlockNo; + // Read the block number this table is associated with. + if (!Buff.readInt(BlockNo)) + return false; + if (BlockNo >= BlockCount) { + errs() << "Unexpected block number: " << BlockNo << " (in " << Name + << ").\n"; + return false; + } + GCOVBlock &Block = *Blocks[BlockNo]; + // Read the word that pads the beginning of the line table. This may be a + // flag of some sort, but seems to always be zero. + if (!Buff.readInt(Dummy)) + return false; + + // Line information starts here and continues up until the last word. + if (Buff.getCursor() != (EndPos - sizeof(uint32_t))) { + StringRef F; + // Read the source file name. + if (!Buff.readString(F)) + return false; + if (Filename != F) { + errs() << "Multiple sources for a single basic block: " << Filename + << " != " << F << " (in " << Name << ").\n"; + return false; + } + // Read lines up to, but not including, the null terminator. + while (Buff.getCursor() < (EndPos - 2 * sizeof(uint32_t))) { + uint32_t Line; + if (!Buff.readInt(Line)) + return false; + // Line 0 means this instruction was injected by the compiler. Skip it. + if (!Line) + continue; + Block.addLine(Line); + } + // Read the null terminator. + if (!Buff.readInt(Dummy)) + return false; + } + // The last word is either a flag or padding, it isn't clear which. Skip + // over it. + if (!Buff.readInt(Dummy)) + return false; + } + return true; +} + +/// readGCDA - Read a function from the GCDA buffer. Return false if an error +/// occurs. 
+bool GCOVFunction::readGCDA(GCOVBuffer &Buff, GCOV::GCOVVersion Version) { + uint32_t HeaderLength; + if (!Buff.readInt(HeaderLength)) + return false; // Function header length + + uint64_t EndPos = Buff.getCursor() + HeaderLength * sizeof(uint32_t); + + uint32_t GCDAIdent; + if (!Buff.readInt(GCDAIdent)) + return false; + if (Ident != GCDAIdent) { + errs() << "Function identifiers do not match: " << Ident + << " != " << GCDAIdent << " (in " << Name << ").\n"; + return false; + } + + uint32_t GCDAChecksum; + if (!Buff.readInt(GCDAChecksum)) + return false; + if (Checksum != GCDAChecksum) { + errs() << "Function checksums do not match: " << Checksum + << " != " << GCDAChecksum << " (in " << Name << ").\n"; + return false; + } + + uint32_t CfgChecksum; + if (Version != GCOV::V402) { + if (!Buff.readInt(CfgChecksum)) + return false; + if (Parent.getChecksum() != CfgChecksum) { + errs() << "File checksums do not match: " << Parent.getChecksum() + << " != " << CfgChecksum << " (in " << Name << ").\n"; + return false; + } + } + + if (Buff.getCursor() < EndPos) { + StringRef GCDAName; + if (!Buff.readString(GCDAName)) + return false; + if (Name != GCDAName) { + errs() << "Function names do not match: " << Name << " != " << GCDAName + << ".\n"; + return false; + } + } + + if (!Buff.readArcTag()) { + errs() << "Arc tag not found (in " << Name << ").\n"; + return false; + } + + uint32_t Count; + if (!Buff.readInt(Count)) + return false; + Count /= 2; + + // This for loop adds the counts for each block. A second nested loop is + // required to combine the edge counts that are contained in the GCDA file. + for (uint32_t BlockNo = 0; Count > 0; ++BlockNo) { + // The last block is always reserved for exit block + if (BlockNo >= Blocks.size()) { + errs() << "Unexpected number of edges (in " << Name << ").\n"; + return false; + } + if (BlockNo == Blocks.size() - 1) + errs() << "(" << Name << ") has arcs from exit block.\n"; + GCOVBlock &Block = *Blocks[BlockNo]; + for (size_t EdgeNo = 0, End = Block.getNumDstEdges(); EdgeNo < End; + ++EdgeNo) { + if (Count == 0) { + errs() << "Unexpected number of edges (in " << Name << ").\n"; + return false; + } + uint64_t ArcCount; + if (!Buff.readInt64(ArcCount)) + return false; + Block.addCount(EdgeNo, ArcCount); + --Count; + } + Block.sortDstEdges(); + } + return true; +} + +/// getEntryCount - Get the number of times the function was called by +/// retrieving the entry block's count. +uint64_t GCOVFunction::getEntryCount() const { + return Blocks.front()->getCount(); +} + +/// getExitCount - Get the number of times the function returned by retrieving +/// the exit block's count. +uint64_t GCOVFunction::getExitCount() const { + return Blocks.back()->getCount(); +} + +void GCOVFunction::print(raw_ostream &OS) const { + OS << "===== " << Name << " (" << Ident << ") @ " << Filename << ":" + << LineNumber << "\n"; + for (const auto &Block : Blocks) + Block->print(OS); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +/// dump - Dump GCOVFunction content to dbgs() for debugging purposes. +LLVM_DUMP_METHOD void GCOVFunction::dump() const { + print(dbgs()); +} +#endif + +/// collectLineCounts - Collect line counts. This must be used after +/// reading .gcno and .gcda files. +void GCOVFunction::collectLineCounts(FileInfo &FI) { + // If the line number is zero, this is a function that doesn't actually appear + // in the source file, so there isn't anything we can do with it. 
+ if (LineNumber == 0) + return; + + for (const auto &Block : Blocks) + Block->collectLineCounts(FI); + FI.addFunctionLine(Filename, LineNumber, this); +} + +//===----------------------------------------------------------------------===// +// GCOVBlock implementation. + +/// ~GCOVBlock - Delete GCOVBlock and its content. +GCOVBlock::~GCOVBlock() { + SrcEdges.clear(); + DstEdges.clear(); + Lines.clear(); +} + +/// addCount - Add to block counter while storing the edge count. If the +/// destination has no outgoing edges, also update that block's count too. +void GCOVBlock::addCount(size_t DstEdgeNo, uint64_t N) { + assert(DstEdgeNo < DstEdges.size()); // up to caller to ensure EdgeNo is valid + DstEdges[DstEdgeNo]->Count = N; + Counter += N; + if (!DstEdges[DstEdgeNo]->Dst.getNumDstEdges()) + DstEdges[DstEdgeNo]->Dst.Counter += N; +} + +/// sortDstEdges - Sort destination edges by block number, nop if already +/// sorted. This is required for printing branch info in the correct order. +void GCOVBlock::sortDstEdges() { + if (!DstEdgesAreSorted) { + SortDstEdgesFunctor SortEdges; + std::stable_sort(DstEdges.begin(), DstEdges.end(), SortEdges); + } +} + +/// collectLineCounts - Collect line counts. This must be used after +/// reading .gcno and .gcda files. +void GCOVBlock::collectLineCounts(FileInfo &FI) { + for (uint32_t N : Lines) + FI.addBlockLine(Parent.getFilename(), N, this); +} + +void GCOVBlock::print(raw_ostream &OS) const { + OS << "Block : " << Number << " Counter : " << Counter << "\n"; + if (!SrcEdges.empty()) { + OS << "\tSource Edges : "; + for (const GCOVEdge *Edge : SrcEdges) + OS << Edge->Src.Number << " (" << Edge->Count << "), "; + OS << "\n"; + } + if (!DstEdges.empty()) { + OS << "\tDestination Edges : "; + for (const GCOVEdge *Edge : DstEdges) + OS << Edge->Dst.Number << " (" << Edge->Count << "), "; + OS << "\n"; + } + if (!Lines.empty()) { + OS << "\tLines : "; + for (uint32_t N : Lines) + OS << (N) << ","; + OS << "\n"; + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +/// dump - Dump GCOVBlock content to dbgs() for debugging purposes. +LLVM_DUMP_METHOD void GCOVBlock::dump() const { + print(dbgs()); +} +#endif + +//===----------------------------------------------------------------------===// +// FileInfo implementation. + +// Safe integer division, returns 0 if numerator is 0. 
+static uint32_t safeDiv(uint64_t Numerator, uint64_t Divisor) { + if (!Numerator) + return 0; + return Numerator / Divisor; +} + +// This custom division function mimics gcov's branch ouputs: +// - Round to closest whole number +// - Only output 0% or 100% if it's exactly that value +static uint32_t branchDiv(uint64_t Numerator, uint64_t Divisor) { + if (!Numerator) + return 0; + if (Numerator == Divisor) + return 100; + + uint8_t Res = (Numerator * 100 + Divisor / 2) / Divisor; + if (Res == 0) + return 1; + if (Res == 100) + return 99; + return Res; +} + +namespace { +struct formatBranchInfo { + formatBranchInfo(const GCOV::Options &Options, uint64_t Count, uint64_t Total) + : Options(Options), Count(Count), Total(Total) {} + + void print(raw_ostream &OS) const { + if (!Total) + OS << "never executed"; + else if (Options.BranchCount) + OS << "taken " << Count; + else + OS << "taken " << branchDiv(Count, Total) << "%"; + } + + const GCOV::Options &Options; + uint64_t Count; + uint64_t Total; +}; + +static raw_ostream &operator<<(raw_ostream &OS, const formatBranchInfo &FBI) { + FBI.print(OS); + return OS; +} + +class LineConsumer { + std::unique_ptr Buffer; + StringRef Remaining; + +public: + LineConsumer(StringRef Filename) { + ErrorOr> BufferOrErr = + MemoryBuffer::getFileOrSTDIN(Filename); + if (std::error_code EC = BufferOrErr.getError()) { + errs() << Filename << ": " << EC.message() << "\n"; + Remaining = ""; + } else { + Buffer = std::move(BufferOrErr.get()); + Remaining = Buffer->getBuffer(); + } + } + bool empty() { return Remaining.empty(); } + void printNext(raw_ostream &OS, uint32_t LineNum) { + StringRef Line; + if (empty()) + Line = "/*EOF*/"; + else + std::tie(Line, Remaining) = Remaining.split("\n"); + OS << format("%5u:", LineNum) << Line << "\n"; + } +}; +} // end anonymous namespace + +/// Convert a path to a gcov filename. If PreservePaths is true, this +/// translates "/" to "#", ".." to "^", and drops ".", to match gcov. +static std::string mangleCoveragePath(StringRef Filename, bool PreservePaths) { + if (!PreservePaths) + return sys::path::filename(Filename).str(); + + // This behaviour is defined by gcov in terms of text replacements, so it's + // not likely to do anything useful on filesystems with different textual + // conventions. + llvm::SmallString<256> Result(""); + StringRef::iterator I, S, E; + for (I = S = Filename.begin(), E = Filename.end(); I != E; ++I) { + if (*I != '/') + continue; + + if (I - S == 1 && *S == '.') { + // ".", the current directory, is skipped. + } else if (I - S == 2 && *S == '.' && *(S + 1) == '.') { + // "..", the parent directory, is replaced with "^". + Result.append("^#"); + } else { + if (S < I) + // Leave other components intact, + Result.append(S, I); + // And separate with "#". + Result.push_back('#'); + } + S = I + 1; + } + + if (S < I) + Result.append(S, I); + return Result.str(); +} + +std::string FileInfo::getCoveragePath(StringRef Filename, + StringRef MainFilename) { + if (Options.NoOutput) + // This is probably a bug in gcov, but when -n is specified, paths aren't + // mangled at all, and the -l and -p options are ignored. Here, we do the + // same. 
+ return Filename; + + std::string CoveragePath; + if (Options.LongFileNames && !Filename.equals(MainFilename)) + CoveragePath = + mangleCoveragePath(MainFilename, Options.PreservePaths) + "##"; + CoveragePath += mangleCoveragePath(Filename, Options.PreservePaths) + ".gcov"; + return CoveragePath; +} + +std::unique_ptr +FileInfo::openCoveragePath(StringRef CoveragePath) { + if (Options.NoOutput) + return llvm::make_unique(); + + std::error_code EC; + auto OS = llvm::make_unique(CoveragePath, EC, + sys::fs::F_Text); + if (EC) { + errs() << EC.message() << "\n"; + return llvm::make_unique(); + } + return std::move(OS); +} + +/// print - Print source files with collected line count information. +void FileInfo::print(raw_ostream &InfoOS, StringRef MainFilename, + StringRef GCNOFile, StringRef GCDAFile) { + SmallVector Filenames; + for (const auto &LI : LineInfo) + Filenames.push_back(LI.first()); + std::sort(Filenames.begin(), Filenames.end()); + + for (StringRef Filename : Filenames) { + auto AllLines = LineConsumer(Filename); + + std::string CoveragePath = getCoveragePath(Filename, MainFilename); + std::unique_ptr CovStream = openCoveragePath(CoveragePath); + raw_ostream &CovOS = *CovStream; + + CovOS << " -: 0:Source:" << Filename << "\n"; + CovOS << " -: 0:Graph:" << GCNOFile << "\n"; + CovOS << " -: 0:Data:" << GCDAFile << "\n"; + CovOS << " -: 0:Runs:" << RunCount << "\n"; + CovOS << " -: 0:Programs:" << ProgramCount << "\n"; + + const LineData &Line = LineInfo[Filename]; + GCOVCoverage FileCoverage(Filename); + for (uint32_t LineIndex = 0; LineIndex < Line.LastLine || !AllLines.empty(); + ++LineIndex) { + if (Options.BranchInfo) { + FunctionLines::const_iterator FuncsIt = Line.Functions.find(LineIndex); + if (FuncsIt != Line.Functions.end()) + printFunctionSummary(CovOS, FuncsIt->second); + } + + BlockLines::const_iterator BlocksIt = Line.Blocks.find(LineIndex); + if (BlocksIt == Line.Blocks.end()) { + // No basic blocks are on this line. Not an executable line of code. + CovOS << " -:"; + AllLines.printNext(CovOS, LineIndex + 1); + } else { + const BlockVector &Blocks = BlocksIt->second; + + // Add up the block counts to form line counts. + DenseMap LineExecs; + uint64_t LineCount = 0; + for (const GCOVBlock *Block : Blocks) { + if (Options.AllBlocks) { + // Only take the highest block count for that line. + uint64_t BlockCount = Block->getCount(); + LineCount = LineCount > BlockCount ? LineCount : BlockCount; + } else { + // Sum up all of the block counts. + LineCount += Block->getCount(); + } + + if (Options.FuncCoverage) { + // This is a slightly convoluted way to most accurately gather line + // statistics for functions. Basically what is happening is that we + // don't want to count a single line with multiple blocks more than + // once. However, we also don't simply want to give the total line + // count to every function that starts on the line. Thus, what is + // happening here are two things: + // 1) Ensure that the number of logical lines is only incremented + // once per function. + // 2) If there are multiple blocks on the same line, ensure that the + // number of lines executed is incremented as long as at least + // one of the blocks are executed. 
+ const GCOVFunction *Function = &Block->getParent(); + if (FuncCoverages.find(Function) == FuncCoverages.end()) { + std::pair KeyValue( + Function, GCOVCoverage(Function->getName())); + FuncCoverages.insert(KeyValue); + } + GCOVCoverage &FuncCoverage = FuncCoverages.find(Function)->second; + + if (LineExecs.find(Function) == LineExecs.end()) { + if (Block->getCount()) { + ++FuncCoverage.LinesExec; + LineExecs[Function] = true; + } else { + LineExecs[Function] = false; + } + ++FuncCoverage.LogicalLines; + } else if (!LineExecs[Function] && Block->getCount()) { + ++FuncCoverage.LinesExec; + LineExecs[Function] = true; + } + } + } + + if (LineCount == 0) + CovOS << " #####:"; + else { + CovOS << format("%9" PRIu64 ":", LineCount); + ++FileCoverage.LinesExec; + } + ++FileCoverage.LogicalLines; + + AllLines.printNext(CovOS, LineIndex + 1); + + uint32_t BlockNo = 0; + uint32_t EdgeNo = 0; + for (const GCOVBlock *Block : Blocks) { + // Only print block and branch information at the end of the block. + if (Block->getLastLine() != LineIndex + 1) + continue; + if (Options.AllBlocks) + printBlockInfo(CovOS, *Block, LineIndex, BlockNo); + if (Options.BranchInfo) { + size_t NumEdges = Block->getNumDstEdges(); + if (NumEdges > 1) + printBranchInfo(CovOS, *Block, FileCoverage, EdgeNo); + else if (Options.UncondBranch && NumEdges == 1) + printUncondBranchInfo(CovOS, EdgeNo, + (*Block->dst_begin())->Count); + } + } + } + } + FileCoverages.push_back(std::make_pair(CoveragePath, FileCoverage)); + } + + // FIXME: There is no way to detect calls given current instrumentation. + if (Options.FuncCoverage) + printFuncCoverage(InfoOS); + printFileCoverage(InfoOS); +} + +/// printFunctionSummary - Print function and block summary. +void FileInfo::printFunctionSummary(raw_ostream &OS, + const FunctionVector &Funcs) const { + for (const GCOVFunction *Func : Funcs) { + uint64_t EntryCount = Func->getEntryCount(); + uint32_t BlocksExec = 0; + for (const GCOVBlock &Block : Func->blocks()) + if (Block.getNumDstEdges() && Block.getCount()) + ++BlocksExec; + + OS << "function " << Func->getName() << " called " << EntryCount + << " returned " << safeDiv(Func->getExitCount() * 100, EntryCount) + << "% blocks executed " + << safeDiv(BlocksExec * 100, Func->getNumBlocks() - 1) << "%\n"; + } +} + +/// printBlockInfo - Output counts for each block. +void FileInfo::printBlockInfo(raw_ostream &OS, const GCOVBlock &Block, + uint32_t LineIndex, uint32_t &BlockNo) const { + if (Block.getCount() == 0) + OS << " $$$$$:"; + else + OS << format("%9" PRIu64 ":", Block.getCount()); + OS << format("%5u-block %2u\n", LineIndex + 1, BlockNo++); +} + +/// printBranchInfo - Print conditional branch probabilities. 
+void FileInfo::printBranchInfo(raw_ostream &OS, const GCOVBlock &Block, + GCOVCoverage &Coverage, uint32_t &EdgeNo) { + SmallVector BranchCounts; + uint64_t TotalCounts = 0; + for (const GCOVEdge *Edge : Block.dsts()) { + BranchCounts.push_back(Edge->Count); + TotalCounts += Edge->Count; + if (Block.getCount()) + ++Coverage.BranchesExec; + if (Edge->Count) + ++Coverage.BranchesTaken; + ++Coverage.Branches; + + if (Options.FuncCoverage) { + const GCOVFunction *Function = &Block.getParent(); + GCOVCoverage &FuncCoverage = FuncCoverages.find(Function)->second; + if (Block.getCount()) + ++FuncCoverage.BranchesExec; + if (Edge->Count) + ++FuncCoverage.BranchesTaken; + ++FuncCoverage.Branches; + } + } + + for (uint64_t N : BranchCounts) + OS << format("branch %2u ", EdgeNo++) + << formatBranchInfo(Options, N, TotalCounts) << "\n"; +} + +/// printUncondBranchInfo - Print unconditional branch probabilities. +void FileInfo::printUncondBranchInfo(raw_ostream &OS, uint32_t &EdgeNo, + uint64_t Count) const { + OS << format("unconditional %2u ", EdgeNo++) + << formatBranchInfo(Options, Count, Count) << "\n"; +} + +// printCoverage - Print generic coverage info used by both printFuncCoverage +// and printFileCoverage. +void FileInfo::printCoverage(raw_ostream &OS, + const GCOVCoverage &Coverage) const { + OS << format("Lines executed:%.2f%% of %u\n", + double(Coverage.LinesExec) * 100 / Coverage.LogicalLines, + Coverage.LogicalLines); + if (Options.BranchInfo) { + if (Coverage.Branches) { + OS << format("Branches executed:%.2f%% of %u\n", + double(Coverage.BranchesExec) * 100 / Coverage.Branches, + Coverage.Branches); + OS << format("Taken at least once:%.2f%% of %u\n", + double(Coverage.BranchesTaken) * 100 / Coverage.Branches, + Coverage.Branches); + } else { + OS << "No branches\n"; + } + OS << "No calls\n"; // to be consistent with gcov + } +} + +// printFuncCoverage - Print per-function coverage info. +void FileInfo::printFuncCoverage(raw_ostream &OS) const { + for (const auto &FC : FuncCoverages) { + const GCOVCoverage &Coverage = FC.second; + OS << "Function '" << Coverage.Name << "'\n"; + printCoverage(OS, Coverage); + OS << "\n"; + } +} + +// printFileCoverage - Print per-file coverage info. +void FileInfo::printFileCoverage(raw_ostream &OS) const { + for (const auto &FC : FileCoverages) { + const std::string &Filename = FC.first; + const GCOVCoverage &Coverage = FC.second; + OS << "File '" << Coverage.Name << "'\n"; + printCoverage(OS, Coverage); + if (!Options.NoOutput) + OS << Coverage.Name << ":creating '" << Filename << "'\n"; + OS << "\n"; + } +} diff --git a/tools/llvm-cov/gcov.cpp b/tools/llvm-cov/gcov.cpp index 4df7f015fd1..7776f2aa9a6 100644 --- a/tools/llvm-cov/gcov.cpp +++ b/tools/llvm-cov/gcov.cpp @@ -11,11 +11,11 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ProfileData/GCOV.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" -#include "llvm/Support/GCOV.h" #include "llvm/Support/Path.h" #include using namespace llvm; -- cgit v1.2.3 From 4fedc84270af2e8925439152e8c324487d01c8d7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Nov 2017 21:08:13 +0000 Subject: [SimplifyCFG] When merging conditional stores, don't count the store we're merging against the PHINodeFoldingThreshold Merging conditional stores tries to check to see if the code is if convertible after the store is moved. 
But the store hasn't been moved yet, so it's being counted against the
threshold. The patch adds 1 to the threshold comparison to make sure we
don't count the store. I've adjusted a test to use a lower threshold to
ensure we still perform that conversion.

Differential Revision: https://reviews.llvm.org/D39570

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317368 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Utils/SimplifyCFG.cpp               | 4 +++-
 test/Transforms/SimplifyCFG/merge-cond-stores-2.ll | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 3c4dae92ebf..e0045e9f48a 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2901,7 +2901,9 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB,
       else
         return false;
     }
-    return N <= PHINodeFoldingThreshold;
+    // The store we want to merge is counted in N, so add 1 to make sure
+    // we're counting the instructions that would be left.
+    return N <= (PHINodeFoldingThreshold + 1);
   };

   if (!MergeCondStoresAggressively &&
diff --git a/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll b/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll
index a2b94038001..a2ca63d0a2d 100644
--- a/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll
+++ b/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S < %s -simplifycfg -simplifycfg-merge-cond-stores=true -simplifycfg-merge-cond-stores-aggressively=false -phi-node-folding-threshold=2 | FileCheck %s
+; RUN: opt -S < %s -simplifycfg -simplifycfg-merge-cond-stores=true -simplifycfg-merge-cond-stores-aggressively=false -phi-node-folding-threshold=1 | FileCheck %s

 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "armv7--linux-gnueabihf"
--
cgit v1.2.3


From 9cf32a0f1d22c5ece8a581e98166ddac8a6e61a7 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne
Date: Fri, 3 Nov 2017 21:30:06 +0000
Subject: Revert r317046, "Object: Move some code from ELF.h into ELF.cpp."

This change resulted in a measured 1.5-2% perf regression linking chrome.
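For context on why the location of these definitions matters: r317046 moved
the bodies of these ELFFile template members out of the header into ELF.cpp.
The likely mechanism behind the regression (an assumption, the commit does not
spell it out) is the usual header-versus-source tradeoff for templates: a
member defined in the header is instantiated, and can be inlined, in every
translation unit that calls it, while a definition living in one .cpp behind
an explicit instantiation forces an out-of-line call on hot paths such as
section-table iteration. A minimal sketch of that tradeoff, with hypothetical
names rather than the real ELFFile API:

    #include <cstdint>
    #include <iostream>
    #include <utility>
    #include <vector>

    template <class SizeT> class FileSketch {
      std::vector<std::uint8_t> Buf;

    public:
      explicit FileSketch(std::vector<std::uint8_t> B) : Buf(std::move(B)) {}

      // Defined in the header: every TU that calls this instantiates it and
      // the optimizer may inline it into hot loops.
      SizeT sectionCount() const { return static_cast<SizeT>(Buf.size()); }

      // The reverted change instead kept only a declaration here and defined
      // the member in a single .cpp with an explicit instantiation, e.g.
      //   template class FileSketch<std::uint64_t>;
      // so every caller in another TU pays a non-inlinable call.
    };

    int main() {
      FileSketch<std::uint64_t> F(std::vector<std::uint8_t>(64, 0));
      std::cout << F.sectionCount() << "\n"; // prints 64
      return 0;
    }

Keeping the definitions in the header, as this revert does, lets the optimizer
see through such accessors at every call site.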
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317371 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Object/ELF.h | 263 ++++++++++++++++++++++++++++++++++++++++++++++ lib/Object/ELF.cpp | 263 ---------------------------------------------- 2 files changed, 263 insertions(+), 263 deletions(-) diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h index 92fb46e8e93..c24b6310465 100644 --- a/include/llvm/Object/ELF.h +++ b/include/llvm/Object/ELF.h @@ -204,6 +204,46 @@ getExtendedSymbolTableIndex(const typename ELFT::Sym *Sym, return ShndxTable[Index]; } +template +Expected +ELFFile::getSectionIndex(const Elf_Sym *Sym, Elf_Sym_Range Syms, + ArrayRef ShndxTable) const { + uint32_t Index = Sym->st_shndx; + if (Index == ELF::SHN_XINDEX) { + auto ErrorOrIndex = getExtendedSymbolTableIndex( + Sym, Syms.begin(), ShndxTable); + if (!ErrorOrIndex) + return ErrorOrIndex.takeError(); + return *ErrorOrIndex; + } + if (Index == ELF::SHN_UNDEF || Index >= ELF::SHN_LORESERVE) + return 0; + return Index; +} + +template +Expected +ELFFile::getSection(const Elf_Sym *Sym, const Elf_Shdr *SymTab, + ArrayRef ShndxTable) const { + auto SymsOrErr = symbols(SymTab); + if (!SymsOrErr) + return SymsOrErr.takeError(); + return getSection(Sym, *SymsOrErr, ShndxTable); +} + +template +Expected +ELFFile::getSection(const Elf_Sym *Sym, Elf_Sym_Range Symbols, + ArrayRef ShndxTable) const { + auto IndexOrErr = getSectionIndex(Sym, Symbols, ShndxTable); + if (!IndexOrErr) + return IndexOrErr.takeError(); + uint32_t Index = *IndexOrErr; + if (Index == 0) + return nullptr; + return getSection(Index); +} + template inline Expected getSymbol(typename ELFT::SymRange Symbols, uint32_t Index) { @@ -212,6 +252,15 @@ getSymbol(typename ELFT::SymRange Symbols, uint32_t Index) { return &Symbols[Index]; } +template +Expected +ELFFile::getSymbol(const Elf_Shdr *Sec, uint32_t Index) const { + auto SymtabOrErr = symbols(Sec); + if (!SymtabOrErr) + return SymtabOrErr.takeError(); + return object::getSymbol(*SymtabOrErr, Index); +} + template template Expected> @@ -232,6 +281,119 @@ ELFFile::getSectionContentsAsArray(const Elf_Shdr *Sec) const { return makeArrayRef(Start, Size / sizeof(T)); } +template +Expected> +ELFFile::getSectionContents(const Elf_Shdr *Sec) const { + return getSectionContentsAsArray(Sec); +} + +template +StringRef ELFFile::getRelocationTypeName(uint32_t Type) const { + return getELFRelocationTypeName(getHeader()->e_machine, Type); +} + +template +void ELFFile::getRelocationTypeName(uint32_t Type, + SmallVectorImpl &Result) const { + if (!isMipsELF64()) { + StringRef Name = getRelocationTypeName(Type); + Result.append(Name.begin(), Name.end()); + } else { + // The Mips N64 ABI allows up to three operations to be specified per + // relocation record. Unfortunately there's no easy way to test for the + // presence of N64 ELFs as they have no special flag that identifies them + // as being N64. We can safely assume at the moment that all Mips + // ELFCLASS64 ELFs are N64. New Mips64 ABIs should provide enough + // information to disambiguate between old vs new ABIs. + uint8_t Type1 = (Type >> 0) & 0xFF; + uint8_t Type2 = (Type >> 8) & 0xFF; + uint8_t Type3 = (Type >> 16) & 0xFF; + + // Concat all three relocation type names. 
+ StringRef Name = getRelocationTypeName(Type1); + Result.append(Name.begin(), Name.end()); + + Name = getRelocationTypeName(Type2); + Result.append(1, '/'); + Result.append(Name.begin(), Name.end()); + + Name = getRelocationTypeName(Type3); + Result.append(1, '/'); + Result.append(Name.begin(), Name.end()); + } +} + +template +Expected +ELFFile::getRelocationSymbol(const Elf_Rel *Rel, + const Elf_Shdr *SymTab) const { + uint32_t Index = Rel->getSymbol(isMips64EL()); + if (Index == 0) + return nullptr; + return getEntry(SymTab, Index); +} + +template +Expected +ELFFile::getSectionStringTable(Elf_Shdr_Range Sections) const { + uint32_t Index = getHeader()->e_shstrndx; + if (Index == ELF::SHN_XINDEX) + Index = Sections[0].sh_link; + + if (!Index) // no section string table. + return ""; + if (Index >= Sections.size()) + return createError("invalid section index"); + return getStringTable(&Sections[Index]); +} + +template ELFFile::ELFFile(StringRef Object) : Buf(Object) {} + +template +Expected> ELFFile::create(StringRef Object) { + if (sizeof(Elf_Ehdr) > Object.size()) + return createError("Invalid buffer"); + return ELFFile(Object); +} + +template +Expected ELFFile::sections() const { + const uintX_t SectionTableOffset = getHeader()->e_shoff; + if (SectionTableOffset == 0) + return ArrayRef(); + + if (getHeader()->e_shentsize != sizeof(Elf_Shdr)) + return createError( + "invalid section header entry size (e_shentsize) in ELF header"); + + const uint64_t FileSize = Buf.size(); + + if (SectionTableOffset + sizeof(Elf_Shdr) > FileSize) + return createError("section header table goes past the end of the file"); + + // Invalid address alignment of section headers + if (SectionTableOffset & (alignof(Elf_Shdr) - 1)) + return createError("invalid alignment of section headers"); + + const Elf_Shdr *First = + reinterpret_cast(base() + SectionTableOffset); + + uintX_t NumSections = getHeader()->e_shnum; + if (NumSections == 0) + NumSections = First->sh_size; + + if (NumSections > UINT64_MAX / sizeof(Elf_Shdr)) + return createError("section table goes past the end of file"); + + const uint64_t SectionTableSize = NumSections * sizeof(Elf_Shdr); + + // Section table goes past end of file! 
+ if (SectionTableOffset + SectionTableSize > FileSize) + return createError("section table goes past the end of file"); + + return makeArrayRef(First, NumSections); +} + template template Expected ELFFile::getEntry(uint32_t Section, @@ -254,6 +416,107 @@ Expected ELFFile::getEntry(const Elf_Shdr *Section, return reinterpret_cast(base() + Pos); } +template +Expected +ELFFile::getSection(uint32_t Index) const { + auto TableOrErr = sections(); + if (!TableOrErr) + return TableOrErr.takeError(); + return object::getSection(*TableOrErr, Index); +} + +template +Expected +ELFFile::getStringTable(const Elf_Shdr *Section) const { + if (Section->sh_type != ELF::SHT_STRTAB) + return createError("invalid sh_type for string table, expected SHT_STRTAB"); + auto V = getSectionContentsAsArray(Section); + if (!V) + return V.takeError(); + ArrayRef Data = *V; + if (Data.empty()) + return createError("empty string table"); + if (Data.back() != '\0') + return createError("string table non-null terminated"); + return StringRef(Data.begin(), Data.size()); +} + +template +Expected> +ELFFile::getSHNDXTable(const Elf_Shdr &Section) const { + auto SectionsOrErr = sections(); + if (!SectionsOrErr) + return SectionsOrErr.takeError(); + return getSHNDXTable(Section, *SectionsOrErr); +} + +template +Expected> +ELFFile::getSHNDXTable(const Elf_Shdr &Section, + Elf_Shdr_Range Sections) const { + assert(Section.sh_type == ELF::SHT_SYMTAB_SHNDX); + auto VOrErr = getSectionContentsAsArray(&Section); + if (!VOrErr) + return VOrErr.takeError(); + ArrayRef V = *VOrErr; + auto SymTableOrErr = object::getSection(Sections, Section.sh_link); + if (!SymTableOrErr) + return SymTableOrErr.takeError(); + const Elf_Shdr &SymTable = **SymTableOrErr; + if (SymTable.sh_type != ELF::SHT_SYMTAB && + SymTable.sh_type != ELF::SHT_DYNSYM) + return createError("invalid sh_type"); + if (V.size() != (SymTable.sh_size / sizeof(Elf_Sym))) + return createError("invalid section contents size"); + return V; +} + +template +Expected +ELFFile::getStringTableForSymtab(const Elf_Shdr &Sec) const { + auto SectionsOrErr = sections(); + if (!SectionsOrErr) + return SectionsOrErr.takeError(); + return getStringTableForSymtab(Sec, *SectionsOrErr); +} + +template +Expected +ELFFile::getStringTableForSymtab(const Elf_Shdr &Sec, + Elf_Shdr_Range Sections) const { + + if (Sec.sh_type != ELF::SHT_SYMTAB && Sec.sh_type != ELF::SHT_DYNSYM) + return createError( + "invalid sh_type for symbol table, expected SHT_SYMTAB or SHT_DYNSYM"); + auto SectionOrErr = object::getSection(Sections, Sec.sh_link); + if (!SectionOrErr) + return SectionOrErr.takeError(); + return getStringTable(*SectionOrErr); +} + +template +Expected +ELFFile::getSectionName(const Elf_Shdr *Section) const { + auto SectionsOrErr = sections(); + if (!SectionsOrErr) + return SectionsOrErr.takeError(); + auto Table = getSectionStringTable(*SectionsOrErr); + if (!Table) + return Table.takeError(); + return getSectionName(Section, *Table); +} + +template +Expected ELFFile::getSectionName(const Elf_Shdr *Section, + StringRef DotShstrtab) const { + uint32_t Offset = Section->sh_name; + if (Offset == 0) + return StringRef(); + if (Offset >= DotShstrtab.size()) + return createError("invalid string offset"); + return StringRef(DotShstrtab.data() + Offset); +} + /// This function returns the hash value for a symbol in the .dynsym section /// Name of the API remains consistent as specified in the libelf /// REF : http://www.sco.com/developers/gabi/latest/ch5.dynamic.html#hash diff --git a/lib/Object/ELF.cpp 
b/lib/Object/ELF.cpp index ef8c844a66f..c72a1258c1e 100644 --- a/lib/Object/ELF.cpp +++ b/lib/Object/ELF.cpp @@ -214,269 +214,6 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) { } } -template -Expected -ELFFile::getSectionIndex(const Elf_Sym *Sym, Elf_Sym_Range Syms, - ArrayRef ShndxTable) const { - uint32_t Index = Sym->st_shndx; - if (Index == ELF::SHN_XINDEX) { - auto ErrorOrIndex = getExtendedSymbolTableIndex( - Sym, Syms.begin(), ShndxTable); - if (!ErrorOrIndex) - return ErrorOrIndex.takeError(); - return *ErrorOrIndex; - } - if (Index == ELF::SHN_UNDEF || Index >= ELF::SHN_LORESERVE) - return 0; - return Index; -} - -template -Expected -ELFFile::getSection(const Elf_Sym *Sym, const Elf_Shdr *SymTab, - ArrayRef ShndxTable) const { - auto SymsOrErr = symbols(SymTab); - if (!SymsOrErr) - return SymsOrErr.takeError(); - return getSection(Sym, *SymsOrErr, ShndxTable); -} - -template -Expected -ELFFile::getSection(const Elf_Sym *Sym, Elf_Sym_Range Symbols, - ArrayRef ShndxTable) const { - auto IndexOrErr = getSectionIndex(Sym, Symbols, ShndxTable); - if (!IndexOrErr) - return IndexOrErr.takeError(); - uint32_t Index = *IndexOrErr; - if (Index == 0) - return nullptr; - return getSection(Index); -} - -template -Expected -ELFFile::getSymbol(const Elf_Shdr *Sec, uint32_t Index) const { - auto SymtabOrErr = symbols(Sec); - if (!SymtabOrErr) - return SymtabOrErr.takeError(); - return object::getSymbol(*SymtabOrErr, Index); -} - -template -Expected> -ELFFile::getSectionContents(const Elf_Shdr *Sec) const { - return getSectionContentsAsArray(Sec); -} - -template -StringRef ELFFile::getRelocationTypeName(uint32_t Type) const { - return getELFRelocationTypeName(getHeader()->e_machine, Type); -} - -template -void ELFFile::getRelocationTypeName(uint32_t Type, - SmallVectorImpl &Result) const { - if (!isMipsELF64()) { - StringRef Name = getRelocationTypeName(Type); - Result.append(Name.begin(), Name.end()); - } else { - // The Mips N64 ABI allows up to three operations to be specified per - // relocation record. Unfortunately there's no easy way to test for the - // presence of N64 ELFs as they have no special flag that identifies them - // as being N64. We can safely assume at the moment that all Mips - // ELFCLASS64 ELFs are N64. New Mips64 ABIs should provide enough - // information to disambiguate between old vs new ABIs. - uint8_t Type1 = (Type >> 0) & 0xFF; - uint8_t Type2 = (Type >> 8) & 0xFF; - uint8_t Type3 = (Type >> 16) & 0xFF; - - // Concat all three relocation type names. - StringRef Name = getRelocationTypeName(Type1); - Result.append(Name.begin(), Name.end()); - - Name = getRelocationTypeName(Type2); - Result.append(1, '/'); - Result.append(Name.begin(), Name.end()); - - Name = getRelocationTypeName(Type3); - Result.append(1, '/'); - Result.append(Name.begin(), Name.end()); - } -} - -template -Expected -ELFFile::getRelocationSymbol(const Elf_Rel *Rel, - const Elf_Shdr *SymTab) const { - uint32_t Index = Rel->getSymbol(isMips64EL()); - if (Index == 0) - return nullptr; - return getEntry(SymTab, Index); -} - -template -Expected -ELFFile::getSectionStringTable(Elf_Shdr_Range Sections) const { - uint32_t Index = getHeader()->e_shstrndx; - if (Index == ELF::SHN_XINDEX) - Index = Sections[0].sh_link; - - if (!Index) // no section string table. 
- return ""; - if (Index >= Sections.size()) - return createError("invalid section index"); - return getStringTable(&Sections[Index]); -} - -template ELFFile::ELFFile(StringRef Object) : Buf(Object) {} - -template -Expected> ELFFile::create(StringRef Object) { - if (sizeof(Elf_Ehdr) > Object.size()) - return createError("Invalid buffer"); - return ELFFile(Object); -} - -template -Expected ELFFile::sections() const { - const uintX_t SectionTableOffset = getHeader()->e_shoff; - if (SectionTableOffset == 0) - return ArrayRef(); - - if (getHeader()->e_shentsize != sizeof(Elf_Shdr)) - return createError( - "invalid section header entry size (e_shentsize) in ELF header"); - - const uint64_t FileSize = Buf.size(); - - if (SectionTableOffset + sizeof(Elf_Shdr) > FileSize) - return createError("section header table goes past the end of the file"); - - // Invalid address alignment of section headers - if (SectionTableOffset & (alignof(Elf_Shdr) - 1)) - return createError("invalid alignment of section headers"); - - const Elf_Shdr *First = - reinterpret_cast(base() + SectionTableOffset); - - uintX_t NumSections = getHeader()->e_shnum; - if (NumSections == 0) - NumSections = First->sh_size; - - if (NumSections > UINT64_MAX / sizeof(Elf_Shdr)) - return createError("section table goes past the end of file"); - - const uint64_t SectionTableSize = NumSections * sizeof(Elf_Shdr); - - // Section table goes past end of file! - if (SectionTableOffset + SectionTableSize > FileSize) - return createError("section table goes past the end of file"); - - return makeArrayRef(First, NumSections); -} - -template -Expected -ELFFile::getSection(uint32_t Index) const { - auto TableOrErr = sections(); - if (!TableOrErr) - return TableOrErr.takeError(); - return object::getSection(*TableOrErr, Index); -} - -template -Expected -ELFFile::getStringTable(const Elf_Shdr *Section) const { - if (Section->sh_type != ELF::SHT_STRTAB) - return createError("invalid sh_type for string table, expected SHT_STRTAB"); - auto V = getSectionContentsAsArray(Section); - if (!V) - return V.takeError(); - ArrayRef Data = *V; - if (Data.empty()) - return createError("empty string table"); - if (Data.back() != '\0') - return createError("string table non-null terminated"); - return StringRef(Data.begin(), Data.size()); -} - -template -Expected> -ELFFile::getSHNDXTable(const Elf_Shdr &Section) const { - auto SectionsOrErr = sections(); - if (!SectionsOrErr) - return SectionsOrErr.takeError(); - return getSHNDXTable(Section, *SectionsOrErr); -} - -template -Expected> -ELFFile::getSHNDXTable(const Elf_Shdr &Section, - Elf_Shdr_Range Sections) const { - assert(Section.sh_type == ELF::SHT_SYMTAB_SHNDX); - auto VOrErr = getSectionContentsAsArray(&Section); - if (!VOrErr) - return VOrErr.takeError(); - ArrayRef V = *VOrErr; - auto SymTableOrErr = object::getSection(Sections, Section.sh_link); - if (!SymTableOrErr) - return SymTableOrErr.takeError(); - const Elf_Shdr &SymTable = **SymTableOrErr; - if (SymTable.sh_type != ELF::SHT_SYMTAB && - SymTable.sh_type != ELF::SHT_DYNSYM) - return createError("invalid sh_type"); - if (V.size() != (SymTable.sh_size / sizeof(Elf_Sym))) - return createError("invalid section contents size"); - return V; -} - -template -Expected -ELFFile::getStringTableForSymtab(const Elf_Shdr &Sec) const { - auto SectionsOrErr = sections(); - if (!SectionsOrErr) - return SectionsOrErr.takeError(); - return getStringTableForSymtab(Sec, *SectionsOrErr); -} - -template -Expected -ELFFile::getStringTableForSymtab(const Elf_Shdr &Sec, - 
Elf_Shdr_Range Sections) const { - - if (Sec.sh_type != ELF::SHT_SYMTAB && Sec.sh_type != ELF::SHT_DYNSYM) - return createError( - "invalid sh_type for symbol table, expected SHT_SYMTAB or SHT_DYNSYM"); - auto SectionOrErr = object::getSection(Sections, Sec.sh_link); - if (!SectionOrErr) - return SectionOrErr.takeError(); - return getStringTable(*SectionOrErr); -} - -template -Expected -ELFFile::getSectionName(const Elf_Shdr *Section) const { - auto SectionsOrErr = sections(); - if (!SectionsOrErr) - return SectionsOrErr.takeError(); - auto Table = getSectionStringTable(*SectionsOrErr); - if (!Table) - return Table.takeError(); - return getSectionName(Section, *Table); -} - -template -Expected ELFFile::getSectionName(const Elf_Shdr *Section, - StringRef DotShstrtab) const { - uint32_t Offset = Section->sh_name; - if (Offset == 0) - return StringRef(); - if (Offset >= DotShstrtab.size()) - return createError("invalid string offset"); - return StringRef(DotShstrtab.data() + Offset); -} - template Expected> ELFFile::android_relas(const Elf_Shdr *Sec) const { -- cgit v1.2.3 From ce8f24e6d75e12371b723070fbcd546ebb01598b Mon Sep 17 00:00:00 2001 From: Kevin Enderby Date: Fri, 3 Nov 2017 21:32:44 +0000 Subject: Fix a crash in llvm-objdump when printing a bad x86_64 relocation in a Mach-O file with a bad section number. rdar://35207539 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317373 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../X86/Inputs/macho-invalid-reloc-section-index | Bin 0 -> 2768 bytes test/tools/llvm-objdump/X86/malformed-machos.test | 3 +++ tools/llvm-objdump/llvm-objdump.cpp | 15 +++++++++++++-- 3 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 test/tools/llvm-objdump/X86/Inputs/macho-invalid-reloc-section-index diff --git a/test/tools/llvm-objdump/X86/Inputs/macho-invalid-reloc-section-index b/test/tools/llvm-objdump/X86/Inputs/macho-invalid-reloc-section-index new file mode 100644 index 00000000000..a9d0b48449b Binary files /dev/null and b/test/tools/llvm-objdump/X86/Inputs/macho-invalid-reloc-section-index differ diff --git a/test/tools/llvm-objdump/X86/malformed-machos.test b/test/tools/llvm-objdump/X86/malformed-machos.test index 292666a3725..e29df464a4e 100644 --- a/test/tools/llvm-objdump/X86/malformed-machos.test +++ b/test/tools/llvm-objdump/X86/malformed-machos.test @@ -66,3 +66,6 @@ INVALID-SYMBOL-LIB_ORDINAL: macho-invalid-symbol-lib_ordinal': truncated or malf RUN: not llvm-objdump -macho -objc-meta-data %p/Inputs/macho-invalid-bind-entry 2>&1 | FileCheck -check-prefix INVALID-BIND-ENTRY %s INVALID-BIND-ENTRY: macho-invalid-bind-entry': truncated or malformed object (for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB bad library ordinal: 83 (max 0) for opcode at: 0x0) + +RUN: llvm-objdump -macho -r %p/Inputs/macho-invalid-reloc-section-index | FileCheck -check-prefix INVALID-RELOC-SECTION-INDEX %s +INVALID-RELOC-SECTION-INDEX: 0000000000000021 X86_64_RELOC_UNSIGNED 8388613 (?,?) diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp index d80f1cb049d..02eaa89f088 100644 --- a/tools/llvm-objdump/llvm-objdump.cpp +++ b/tools/llvm-objdump/llvm-objdump.cpp @@ -865,8 +865,19 @@ static void printRelocationTargetName(const MachOObjectFile *O, } else { section_iterator SI = O->section_begin(); // Adjust for the fact that sections are 1-indexed. 
-      advance(SI, Val - 1);
-      SI->getName(S);
+      if (Val == 0) {
+        fmt << "0 (?,?)";
+        return;
+      }
+      uint32_t i = Val - 1;
+      while (i != 0 && SI != O->section_end()) {
+        i--;
+        advance(SI, 1);
+      }
+      if (SI == O->section_end())
+        fmt << Val << " (?,?)";
+      else
+        SI->getName(S);
     }

     fmt << S;
--
cgit v1.2.3


From cdc57825ed64b0995a34dcbf9f330e8b2d5cd5bd Mon Sep 17 00:00:00 2001
From: Sean Fertile
Date: Fri, 3 Nov 2017 21:45:55 +0000
Subject: [LTO][ThinLTO] Use the linker resolutions to mark global values as dso_local.

Now that we have a way to mark GlobalValues as local, we can use the symbol
resolutions that the linker plugin provides as part of the LTO/ThinLTO link
step to refine the compiler's view on what symbols will end up being local.

Differential Revision: https://reviews.llvm.org/D35702

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317374 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/ModuleSummaryIndex.h               |  12 +++++++--
 include/llvm/IR/ModuleSummaryIndexYAML.h           |   8 +++---
 lib/Analysis/ModuleSummaryAnalysis.cpp             |   9 ++++---
 lib/Bitcode/Reader/BitcodeReader.cpp               |   4 ++-
 lib/Bitcode/Writer/BitcodeWriter.cpp               |   2 ++
 lib/LTO/LTO.cpp                                    |  21 +++++++++++----
 lib/Transforms/Utils/FunctionImportUtils.cpp       |  17 ++++++++++++
 test/Bitcode/thinlto-summary-local-5.0.ll          |  22 +++++++++++++++
 test/Bitcode/thinlto-summary-local-5.0.ll.bc       | Bin 0 -> 1028 bytes
 test/LTO/Resolution/X86/comdat-mixed-lto.ll        |   2 +-
 test/LTO/Resolution/X86/comdat.ll                  |   4 +--
 test/LTO/Resolution/X86/commons.ll                 |   2 +-
 test/ThinLTO/X86/deadstrip.ll                      |  30 +++++++++++++--------
 test/ThinLTO/X86/funcimport2.ll                    |   4 +--
 test/ThinLTO/X86/internalize.ll                    |   9 ++++---
 test/ThinLTO/X86/reference_non_importable.ll       |   2 +-
 test/Transforms/LowerTypeTests/import-unsat.ll     |   1 +
 .../PGOProfile/thinlto_samplepgo_icp2.ll           |   2 +-
 test/Transforms/WholeProgramDevirt/import-indir.ll |   1 +
 19 files changed, 115 insertions(+), 37 deletions(-)
 create mode 100644 test/Bitcode/thinlto-summary-local-5.0.ll
 create mode 100644 test/Bitcode/thinlto-summary-local-5.0.ll.bc

diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h
index 2d664f41e3c..b1e58a2a0d9 100644
--- a/include/llvm/IR/ModuleSummaryIndex.h
+++ b/include/llvm/IR/ModuleSummaryIndex.h
@@ -148,11 +148,15 @@ public:
     /// In combined summary, indicate that the global value is live.
     unsigned Live : 1;

+    /// Indicates that the linker resolved the symbol to a definition from
+    /// within the same linkage unit.
+    unsigned DSOLocal : 1;
+
     /// Convenience Constructors
     explicit GVFlags(GlobalValue::LinkageTypes Linkage,
-                     bool NotEligibleToImport, bool Live)
+                     bool NotEligibleToImport, bool Live, bool IsLocal)
         : Linkage(Linkage), NotEligibleToImport(NotEligibleToImport),
-          Live(Live) {}
+          Live(Live), DSOLocal(IsLocal) {}
   };

 private:
@@ -229,6 +233,10 @@ public:

   void setLive(bool Live) { Flags.Live = Live; }

+  void setDSOLocal(bool Local) { Flags.DSOLocal = Local; }
+
+  bool isDSOLocal() const { return Flags.DSOLocal; }
+
   /// Flag that this global value cannot be imported.
void setNotEligibleToImport() { Flags.NotEligibleToImport = true; } diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h index 2f9990ca03d..4687f2d53e7 100644 --- a/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -135,7 +135,7 @@ template <> struct MappingTraits { struct FunctionSummaryYaml { unsigned Linkage; - bool NotEligibleToImport, Live; + bool NotEligibleToImport, Live, IsLocal; std::vector TypeTests; std::vector TypeTestAssumeVCalls, TypeCheckedLoadVCalls; @@ -177,6 +177,7 @@ template <> struct MappingTraits { io.mapOptional("Linkage", summary.Linkage); io.mapOptional("NotEligibleToImport", summary.NotEligibleToImport); io.mapOptional("Live", summary.Live); + io.mapOptional("Local", summary.IsLocal); io.mapOptional("TypeTests", summary.TypeTests); io.mapOptional("TypeTestAssumeVCalls", summary.TypeTestAssumeVCalls); io.mapOptional("TypeCheckedLoadVCalls", summary.TypeCheckedLoadVCalls); @@ -211,7 +212,7 @@ template <> struct CustomMappingTraits { Elem.SummaryList.push_back(llvm::make_unique( GlobalValueSummary::GVFlags( static_cast(FSum.Linkage), - FSum.NotEligibleToImport, FSum.Live), + FSum.NotEligibleToImport, FSum.Live, FSum.IsLocal), 0, FunctionSummary::FFlags{}, ArrayRef{}, ArrayRef{}, std::move(FSum.TypeTests), std::move(FSum.TypeTestAssumeVCalls), @@ -228,7 +229,8 @@ template <> struct CustomMappingTraits { FSums.push_back(FunctionSummaryYaml{ FSum->flags().Linkage, static_cast(FSum->flags().NotEligibleToImport), - static_cast(FSum->flags().Live), FSum->type_tests(), + static_cast(FSum->flags().Live), + static_cast(FSum->flags().DSOLocal), FSum->type_tests(), FSum->type_test_assume_vcalls(), FSum->type_checked_load_vcalls(), FSum->type_test_assume_const_vcalls(), FSum->type_checked_load_const_vcalls()}); diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp index afd575e7273..82db09ca97b 100644 --- a/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -303,7 +303,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, // FIXME: refactor this to use the same code that inliner is using. 
F.isVarArg(); GlobalValueSummary::GVFlags Flags(F.getLinkage(), NotEligibleForImport, - /* Live = */ false); + /* Live = */ false, F.isDSOLocal()); FunctionSummary::FFlags FunFlags{ F.hasFnAttribute(Attribute::ReadNone), F.hasFnAttribute(Attribute::ReadOnly), @@ -329,7 +329,7 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V, findRefEdges(Index, &V, RefEdges, Visited); bool NonRenamableLocal = isNonRenamableLocal(V); GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal, - /* Live = */ false); + /* Live = */ false, V.isDSOLocal()); auto GVarSummary = llvm::make_unique(Flags, RefEdges.takeVector()); if (NonRenamableLocal) @@ -342,7 +342,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A, DenseSet &CantBePromoted) { bool NonRenamableLocal = isNonRenamableLocal(A); GlobalValueSummary::GVFlags Flags(A.getLinkage(), NonRenamableLocal, - /* Live = */ false); + /* Live = */ false, A.isDSOLocal()); auto AS = llvm::make_unique(Flags); auto *Aliasee = A.getBaseObject(); auto *AliaseeSummary = Index.getGlobalValueSummary(*Aliasee); @@ -410,7 +410,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( assert(GV->isDeclaration() && "Def in module asm already has definition"); GlobalValueSummary::GVFlags GVFlags(GlobalValue::InternalLinkage, /* NotEligibleToImport = */ true, - /* Live = */ true); + /* Live = */ true, + /* Local */ GV->isDSOLocal()); CantBePromoted.insert(GlobalValue::getGUID(Name)); // Create the appropriate summary type. if (Function *F = dyn_cast(GV)) { diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index c2272260f44..d0f11db8f61 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -889,7 +889,9 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags, // to work correctly on earlier versions, we must conservatively treat all // values as live. bool Live = (RawFlags & 0x2) || Version < 3; - return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live); + bool Local = (RawFlags & 0x4); + + return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live, Local); } static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) { diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 1e491aa066e..c5d376c9426 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -955,6 +955,8 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) { RawFlags |= Flags.NotEligibleToImport; // bool RawFlags |= (Flags.Live << 1); + RawFlags |= (Flags.DSOLocal << 2); + // Linkage don't need to be remapped at that time for the summary. Any future // change to the getEncodedLinkage() function will need to be taken into // account here as well. diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index 017dd201f9c..9c737795b5a 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -630,6 +630,9 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef Syms, NonPrevailingComdats.insert(GV->getComdat()); cast(GV)->setComdat(nullptr); } + + // Set the 'local' flag based on the linker resolution for this symbol. + GV->setDSOLocal(Res.FinalDefinitionInLinkageUnit); } // Common resolution: collect the maximum size/alignment over all commons. 
// We also record if we see an instance of a common as prevailing, so that @@ -643,7 +646,6 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef Syms, CommonRes.Prevailing |= Res.Prevailing; } - // FIXME: use proposed local attribute for FinalDefinitionInLinkageUnit. } if (!M.getComdatSymbolTable().empty()) for (GlobalValue &GV : M.global_values()) @@ -698,10 +700,10 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef Syms, assert(ResI != ResE); SymbolResolution Res = *ResI++; - if (Res.Prevailing) { - if (!Sym.getIRName().empty()) { - auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier( - Sym.getIRName(), GlobalValue::ExternalLinkage, "")); + if (!Sym.getIRName().empty()) { + auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier( + Sym.getIRName(), GlobalValue::ExternalLinkage, "")); + if (Res.Prevailing) { ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier(); // For linker redefined symbols (via --wrap or --defsym) we want to @@ -713,6 +715,15 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef Syms, GUID, BM.getModuleIdentifier())) S->setLinkage(GlobalValue::WeakAnyLinkage); } + + // If the linker resolved the symbol to a local definition then mark it + // as local in the summary for the module we are adding. + if (Res.FinalDefinitionInLinkageUnit) { + if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( + GUID, BM.getModuleIdentifier())) { + S->setDSOLocal(true); + } + } } } diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp index fbb61ac1ae9..2e6fc4e8482 100644 --- a/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -203,6 +203,23 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV, } void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { + + // Check the summaries to see if the symbol gets resolved to a known local + // definition. + if (GV.hasName()) { + ValueInfo VI = ImportIndex.getValueInfo(GV.getGUID()); + if (VI) { + // Need to check all summaries are local in case of hash collisions. + bool IsLocal = VI.getSummaryList().size() && + llvm::all_of(VI.getSummaryList(), + [](const std::unique_ptr &Summary) { + return Summary->isDSOLocal(); + }); + if (IsLocal) + GV.setDSOLocal(true); + } + } + bool DoPromote = false; if (GV.hasLocalLinkage() && ((DoPromote = shouldPromoteLocalToGlobal(&GV)) || isPerformingImport())) { diff --git a/test/Bitcode/thinlto-summary-local-5.0.ll b/test/Bitcode/thinlto-summary-local-5.0.ll new file mode 100644 index 00000000000..cbc48d23df3 --- /dev/null +++ b/test/Bitcode/thinlto-summary-local-5.0.ll @@ -0,0 +1,22 @@ +; Bitcode compatibility test for dso_local flag in thin-lto summaries. +; Checks that older bitcode summaries without the dso_local op are still +; properly parsed and don't set GlobalValues as dso_local. 
+ +; RUN: llvm-dis < %s.bc | FileCheck %s +; RUN: llvm-bcanalyzer -dump %s.bc | FileCheck %s --check-prefix=BCAN + +define void @foo() { +;CHECK-DAG:define void @foo() + ret void +} + +@bar = global i32 0 +;CHECK-DAG: @bar = global i32 0 + +@baz = alias i32, i32* @bar +;CHECK-DAG: @bar = global i32 0 + +;BCAN: +;BCAN-NEXT: +;BCAN-NEXT: diff --git a/test/Bitcode/thinlto-summary-local-5.0.ll.bc b/test/Bitcode/thinlto-summary-local-5.0.ll.bc new file mode 100644 index 00000000000..8dc7ca0a74b Binary files /dev/null and b/test/Bitcode/thinlto-summary-local-5.0.ll.bc differ diff --git a/test/LTO/Resolution/X86/comdat-mixed-lto.ll b/test/LTO/Resolution/X86/comdat-mixed-lto.ll index f6ee22e4161..d6022c64351 100644 --- a/test/LTO/Resolution/X86/comdat-mixed-lto.ll +++ b/test/LTO/Resolution/X86/comdat-mixed-lto.ll @@ -17,7 +17,7 @@ ; would clash with the copy from this module. ; RUN: llvm-dis %t3.0.0.preopt.bc -o - | FileCheck %s ; CHECK: define internal void @__cxx_global_var_init() section ".text.startup" { -; CHECK: define available_externally void @testglobfunc() section ".text.startup" { +; CHECK: define available_externally dso_local void @testglobfunc() section ".text.startup" { ; ModuleID = 'comdat-mixed-lto.o' source_filename = "comdat-mixed-lto.cpp" diff --git a/test/LTO/Resolution/X86/comdat.ll b/test/LTO/Resolution/X86/comdat.ll index 60d082b3e0f..94f28384231 100644 --- a/test/LTO/Resolution/X86/comdat.ll +++ b/test/LTO/Resolution/X86/comdat.ll @@ -70,14 +70,14 @@ bb11: ; CHECK-DAG: @a23 = alias i32 (i8*), i32 (i8*)* @f1.2{{$}} ; CHECK-DAG: @a24 = alias i16, bitcast (i32 (i8*)* @f1.2 to i16*) -; CHECK: define weak_odr i32 @f1(i8*) comdat($c1) { +; CHECK: define weak_odr dso_local i32 @f1(i8*) comdat($c1) { ; CHECK-NEXT: bb10: ; CHECK-NEXT: br label %bb11{{$}} ; CHECK: bb11: ; CHECK-NEXT: ret i32 42 ; CHECK-NEXT: } -; CHECK: define internal i32 @f1.2(i8* %this) comdat($c2) { +; CHECK: define internal dso_local i32 @f1.2(i8* %this) comdat($c2) { ; CHECK-NEXT: bb20: ; CHECK-NEXT: store i8* %this, i8** null ; CHECK-NEXT: br label %bb21 diff --git a/test/LTO/Resolution/X86/commons.ll b/test/LTO/Resolution/X86/commons.ll index 28bf1ada4a8..8adfb87d6ed 100644 --- a/test/LTO/Resolution/X86/commons.ll +++ b/test/LTO/Resolution/X86/commons.ll @@ -4,7 +4,7 @@ ; RUN: llvm-dis -o - %t.out.0.0.preopt.bc | FileCheck %s ; A strong definition should override the common -; CHECK: @x = global i32 42, align 4 +; CHECK: @x = dso_local global i32 42, align 4 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/ThinLTO/X86/deadstrip.ll b/test/ThinLTO/X86/deadstrip.ll index c19ccb01be3..90de3bb9a32 100644 --- a/test/ThinLTO/X86/deadstrip.ll +++ b/test/ThinLTO/X86/deadstrip.ll @@ -18,8 +18,8 @@ ; RUN: -r %t2.bc,_boo,pl \ ; RUN: -r %t2.bc,_dead_func,pl \ ; RUN: -r %t2.bc,_another_dead_func,pl -; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s -; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=CHECK2 +; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s --check-prefix=LTO2 +; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=LTO2-CHECK2 ; RUN: llvm-nm %t.out.1 | FileCheck %s --check-prefix=CHECK2-NM ; RUN: llvm-bcanalyzer -dump %t.out.index.bc | FileCheck %s --check-prefix=COMBINED @@ -27,14 +27,14 @@ ; COMBINED-DAG: Date: Fri, 3 Nov 2017 21:55:03 +0000 Subject: Invoke salvageDebugInfo from CodeGenPrepare's SinkCast() This preserves the debug info for the cast operation in the original location. 
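The mechanics: SinkCast() already erased casts with no remaining uses; the
patch simply calls salvageDebugInfo() first, so dbg.value intrinsics that
refer to the cast are rewritten rather than silently dropped. A minimal
sketch of that salvage-before-erase pattern, using the
llvm::salvageDebugInfo(Instruction &) entry point this patch relies on (the
helper name below is hypothetical, not part of the patch):

    #include "llvm/IR/Instruction.h"
    #include "llvm/Transforms/Utils/Local.h"

    // Erase an instruction whose value is no longer used, but first give the
    // debug-info machinery a chance to rewrite dbg.value intrinsics that
    // still refer to it, so variable locations survive the deletion.
    static void eraseDeadInstSalvagingDebugInfo(llvm::Instruction &I) {
      if (I.use_empty()) {
        llvm::salvageDebugInfo(I); // rewrites dbg.value uses where possible
        I.eraseFromParent();       // safe: no remaining non-debug uses
      }
    }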
rdar://problem/33460652 Reapplied r317340 with the test moved into an ARM-specific directory. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317375 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/CodeGenPrepare.cpp | 1 + lib/Transforms/Utils/Local.cpp | 2 +- test/DebugInfo/ARM/salvage-debug-info.ll | 118 +++++++++++++++++++++++++++ 3 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 test/DebugInfo/ARM/salvage-debug-info.ll diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 973ddebd987..73f014704b8 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -1171,6 +1171,7 @@ static bool SinkCast(CastInst *CI) { // If we removed all uses, nuke the cast. if (CI->use_empty()) { + salvageDebugInfo(*CI); CI->eraseFromParent(); MadeChange = true; } diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 8c643c93ec4..cb7978f76aa 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -1366,7 +1366,7 @@ void llvm::salvageDebugInfo(Instruction &I) { return MetadataAsValue::get(I.getContext(), ValueAsMetadata::get(V)); }; - if (isa<BitCastInst>(&I)) { + if (isa<BitCastInst>(&I) || isa<IntToPtrInst>(&I)) { findDbgValues(DbgValues, &I); for (auto *DVI : DbgValues) { // Bitcasts are entirely irrelevant for debug info. Rewrite the dbg.value diff --git a/test/DebugInfo/ARM/salvage-debug-info.ll b/test/DebugInfo/ARM/salvage-debug-info.ll new file mode 100644 index 00000000000..5509b92a5c1 --- /dev/null +++ b/test/DebugInfo/ARM/salvage-debug-info.ll @@ -0,0 +1,118 @@ +; RUN: opt -codegenprepare -S %s -o - | FileCheck %s +; typedef struct info { +; unsigned long long size; +; } info_t; +; extern unsigned p; +; extern unsigned n; +; void f() { +; unsigned int i; +; if (p) { +; info_t *info = (info_t *)p; +; for (i = 0; i < n; i++) +; use(info[i].size); +; } +; } +source_filename = "debug.i" +target datalayout = "e-m:o-p:32:32-i64:64-a:0:32-n32-S128" +target triple = "thumbv7k-apple-ios10.0.0" + +%struct.info = type { i64 } + +@p = external local_unnamed_addr global i32, align 4 +@n = external local_unnamed_addr global i32, align 4 + +; Function Attrs: nounwind ssp uwtable +define void @f() local_unnamed_addr #0 !dbg !16 { +entry: + %0 = load i32, i32* @p, align 4, !dbg !25 + %tobool = icmp eq i32 %0, 0, !dbg !25 + br i1 %tobool, label %if.end, label %if.then, !dbg !26 + +if.then: ; preds = %entry + %1 = inttoptr i32 %0 to %struct.info*, !dbg !27 + tail call void @llvm.dbg.value(metadata %struct.info* %1, metadata !22, metadata !DIExpression()), !dbg !28 + ; CHECK: call void @llvm.dbg.value(metadata i32 %0, metadata !22, metadata !DIExpression()) + tail call void @llvm.dbg.value(metadata i32 0, metadata !20, metadata !DIExpression()), !dbg !29 + %2 = load i32, i32* @n, align 4, !dbg !30 + %cmp5 = icmp eq i32 %2, 0, !dbg !33 + br i1 %cmp5, label %if.end, label %for.body.preheader, !dbg !34 + +for.body.preheader: ; preds = %if.then + ; CHECK: for.body.preheader: + ; CHECK: %2 = inttoptr i32 %0 to %struct.info* + br label %for.body, !dbg !35 + +for.body: ; preds = %for.body.preheader, %for.body + %lsr.iv = phi %struct.info* [ %1, %for.body.preheader ], [ %scevgep, %for.body ] + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %lsr.iv7 = bitcast %struct.info* %lsr.iv to i64* + tail call void @llvm.dbg.value(metadata i32 %i.06, metadata !20, metadata !DIExpression()), !dbg !29 + %3 = load i64, i64* %lsr.iv7, align 8, !dbg !35 + %call = tail call i32 bitcast (i32 (...)* @use to i32 (i64)*)(i64
%3) #3, !dbg !36 + %inc = add nuw i32 %i.06, 1, !dbg !37 + tail call void @llvm.dbg.value(metadata i32 %inc, metadata !20, metadata !DIExpression()), !dbg !29 + %4 = load i32, i32* @n, align 4, !dbg !30 + %scevgep = getelementptr %struct.info, %struct.info* %lsr.iv, i32 1, !dbg !33 + %cmp = icmp ult i32 %inc, %4, !dbg !33 + br i1 %cmp, label %for.body, label %if.end.loopexit, !dbg !34, !llvm.loop !38 + +if.end.loopexit: ; preds = %for.body + br label %if.end, !dbg !40 + +if.end: ; preds = %if.end.loopexit, %if.then, %entry + ret void, !dbg !40 +} +declare i32 @use(...) local_unnamed_addr #1 + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) #2 + +attributes #0 = { nounwind ssp uwtable } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { nobuiltin nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!10, !11, !12, !13, !14} +!llvm.ident = !{!15} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0 (trunk 317231) (llvm/trunk 317262)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3) +!1 = !DIFile(filename: "debug.i", directory: "/Data/radar/35321562") +!2 = !{} +!3 = !{!4} +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 32) +!5 = !DIDerivedType(tag: DW_TAG_typedef, name: "info_t", file: !1, line: 3, baseType: !6) +!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "info", file: !1, line: 1, size: 64, elements: !7) +!7 = !{!8} +!8 = !DIDerivedType(tag: DW_TAG_member, name: "size", scope: !6, file: !1, line: 2, baseType: !9, size: 64) +!9 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned) +!10 = !{i32 2, !"Dwarf Version", i32 4} +!11 = !{i32 2, !"Debug Info Version", i32 3} +!12 = !{i32 1, !"wchar_size", i32 4} +!13 = !{i32 1, !"min_enum_size", i32 4} +!14 = !{i32 7, !"PIC Level", i32 2} +!15 = !{!"clang version 6.0.0 (trunk 317231) (llvm/trunk 317262)"} +!16 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 6, type: !17, isLocal: false, isDefinition: true, scopeLine: 6, isOptimized: true, unit: !0, variables: !19) +!17 = !DISubroutineType(types: !18) +!18 = !{null} +!19 = !{!20, !22} +!20 = !DILocalVariable(name: "i", scope: !16, file: !1, line: 7, type: !21) +!21 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) +!22 = !DILocalVariable(name: "info", scope: !23, file: !1, line: 9, type: !4) +!23 = distinct !DILexicalBlock(scope: !24, file: !1, line: 8, column: 10) +!24 = distinct !DILexicalBlock(scope: !16, file: !1, line: 8, column: 7) +!25 = !DILocation(line: 8, column: 7, scope: !24) +!26 = !DILocation(line: 8, column: 7, scope: !16) +!27 = !DILocation(line: 9, column: 20, scope: !23) +!28 = !DILocation(line: 9, column: 13, scope: !23) +!29 = !DILocation(line: 7, column: 16, scope: !16) +!30 = !DILocation(line: 10, column: 21, scope: !31) +!31 = distinct !DILexicalBlock(scope: !32, file: !1, line: 10, column: 5) +!32 = distinct !DILexicalBlock(scope: !23, file: !1, line: 10, column: 5) +!33 = !DILocation(line: 10, column: 19, scope: !31) +!34 = !DILocation(line: 10, column: 5, scope: !32) +!35 = !DILocation(line: 11, column: 19, scope: !31) +!36 = !DILocation(line: 11, column: 7, scope: !31) +!37 = !DILocation(line: 10, column: 25, scope: !31) +!38 = distinct !{!38, !34, !39} +!39 = !DILocation(line: 11, column: 23, scope: !32) +!40 = !DILocation(line: 13, column: 1, scope: !16) -- cgit v1.2.3 From 
803f827385f6dce7f4b44867efdc84b332fd82d2 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Fri, 3 Nov 2017 22:32:11 +0000 Subject: Move TargetFrameLowering.h to CodeGen where it's implemented This header already includes a CodeGen header and is implemented in lib/CodeGen, so move the header there to match. This fixes a link error with modular codegeneration builds - where a header and its implementation are circularly dependent and so need to be in the same library, not split between two like this. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317379 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/TargetFrameLowering.h | 348 +++++++++++++++++++++ include/llvm/Target/TargetFrameLowering.h | 348 --------------------- lib/CodeGen/AsmPrinter/ARMException.cpp | 2 +- lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 2 +- lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 2 +- lib/CodeGen/AsmPrinter/DwarfCFIException.cpp | 2 +- lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 2 +- lib/CodeGen/AsmPrinter/WinException.cpp | 2 +- lib/CodeGen/FEntryInserter.cpp | 2 +- lib/CodeGen/GCRootLowering.cpp | 2 +- lib/CodeGen/GlobalISel/IRTranslator.cpp | 2 +- lib/CodeGen/LiveDebugValues.cpp | 2 +- lib/CodeGen/LocalStackSlotAllocation.cpp | 2 +- lib/CodeGen/MachineFrameInfo.cpp | 2 +- lib/CodeGen/MachineFunction.cpp | 2 +- lib/CodeGen/PatchableFunction.cpp | 2 +- lib/CodeGen/PrologEpilogInserter.cpp | 2 +- lib/CodeGen/RegUsageInfoCollector.cpp | 2 +- lib/CodeGen/RegisterClassInfo.cpp | 2 +- lib/CodeGen/RegisterScavenging.cpp | 2 +- lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 2 +- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 2 +- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 2 +- lib/CodeGen/ShrinkWrap.cpp | 2 +- lib/CodeGen/TargetFrameLoweringImpl.cpp | 2 +- lib/CodeGen/TargetInstrInfo.cpp | 2 +- lib/CodeGen/TargetOptionsImpl.cpp | 2 +- lib/CodeGen/TargetRegisterInfo.cpp | 2 +- lib/Target/AArch64/AArch64FrameLowering.h | 2 +- lib/Target/AArch64/AArch64RegisterInfo.cpp | 2 +- lib/Target/AMDGPU/AMDGPUFrameLowering.h | 2 +- lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 2 +- lib/Target/ARC/ARCFrameLowering.h | 2 +- lib/Target/ARC/ARCRegisterInfo.cpp | 2 +- lib/Target/ARM/ARMFrameLowering.h | 2 +- lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 2 +- lib/Target/ARM/ThumbRegisterInfo.cpp | 2 +- lib/Target/AVR/AVRFrameLowering.h | 2 +- lib/Target/AVR/AVRRegisterInfo.cpp | 2 +- lib/Target/BPF/BPFFrameLowering.h | 2 +- lib/Target/BPF/BPFRegisterInfo.cpp | 2 +- lib/Target/Hexagon/HexagonFrameLowering.h | 2 +- lib/Target/Lanai/LanaiFrameLowering.h | 2 +- lib/Target/Lanai/LanaiRegisterInfo.cpp | 2 +- lib/Target/Lanai/LanaiSubtarget.h | 2 +- lib/Target/Lanai/LanaiTargetMachine.h | 2 +- lib/Target/MSP430/MSP430FrameLowering.h | 2 +- lib/Target/MSP430/MSP430TargetMachine.h | 2 +- lib/Target/Mips/Mips16FrameLowering.cpp | 2 +- lib/Target/Mips/Mips16RegisterInfo.cpp | 2 +- lib/Target/Mips/MipsFrameLowering.h | 2 +- lib/Target/Mips/MipsISelLowering.cpp | 2 +- lib/Target/Mips/MipsRegisterInfo.cpp | 2 +- lib/Target/Mips/MipsSERegisterInfo.cpp | 2 +- lib/Target/NVPTX/NVPTXFrameLowering.h | 2 +- lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp | 2 +- lib/Target/NVPTX/NVPTXTargetMachine.h | 2 +- lib/Target/Nios2/Nios2FrameLowering.h | 2 +- lib/Target/PowerPC/PPCBranchCoalescing.cpp | 2 +- lib/Target/PowerPC/PPCFrameLowering.h | 2 +- lib/Target/PowerPC/PPCRegisterInfo.cpp | 2 +- lib/Target/RISCV/RISCVFrameLowering.h | 2 +- lib/Target/RISCV/RISCVRegisterInfo.cpp | 2 +- lib/Target/Sparc/SparcFrameLowering.h | 2 +- 
lib/Target/Sparc/SparcSubtarget.h | 2 +- lib/Target/SystemZ/SystemZFrameLowering.h | 2 +- lib/Target/SystemZ/SystemZRegisterInfo.cpp | 2 +- lib/Target/WebAssembly/WebAssemblyFrameLowering.h | 2 +- lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp | 2 +- lib/Target/X86/X86FrameLowering.h | 2 +- lib/Target/X86/X86RegisterInfo.cpp | 2 +- lib/Target/XCore/XCoreFrameLowering.h | 2 +- lib/Target/XCore/XCoreRegisterInfo.cpp | 2 +- unittests/CodeGen/MachineInstrTest.cpp | 2 +- 74 files changed, 420 insertions(+), 420 deletions(-) create mode 100644 include/llvm/CodeGen/TargetFrameLowering.h delete mode 100644 include/llvm/Target/TargetFrameLowering.h diff --git a/include/llvm/CodeGen/TargetFrameLowering.h b/include/llvm/CodeGen/TargetFrameLowering.h new file mode 100644 index 00000000000..5cf4627f3c9 --- /dev/null +++ b/include/llvm/CodeGen/TargetFrameLowering.h @@ -0,0 +1,348 @@ +//===-- llvm/CodeGen/TargetFrameLowering.h ---------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Interface to describe the layout of a stack frame on the target machine. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_TARGETFRAMELOWERING_H +#define LLVM_CODEGEN_TARGETFRAMELOWERING_H + +#include "llvm/CodeGen/MachineBasicBlock.h" +#include <utility> +#include <vector> + +namespace llvm { + class BitVector; + class CalleeSavedInfo; + class MachineFunction; + class RegScavenger; + +/// Information about stack frame layout on the target. It holds the direction +/// of stack growth, the known stack alignment on entry to each function, and +/// the offset to the locals area. +/// +/// The offset to the local area is the offset from the stack pointer on +/// function entry to the first location where function data (local variables, +/// spill locations) can be stored. +class TargetFrameLowering { +public: + enum StackDirection { + StackGrowsUp, // Adding to the stack increases the stack address + StackGrowsDown // Adding to the stack decreases the stack address + }; + + // Maps a callee saved register to a stack slot with a fixed offset. + struct SpillSlot { + unsigned Reg; + int Offset; // Offset relative to stack pointer on function entry. + }; +private: + StackDirection StackDir; + unsigned StackAlignment; + unsigned TransientStackAlignment; + int LocalAreaOffset; + bool StackRealignable; +public: + TargetFrameLowering(StackDirection D, unsigned StackAl, int LAO, + unsigned TransAl = 1, bool StackReal = true) + : StackDir(D), StackAlignment(StackAl), TransientStackAlignment(TransAl), + LocalAreaOffset(LAO), StackRealignable(StackReal) {} + + virtual ~TargetFrameLowering(); + + // These methods return information that describes the abstract stack layout + // of the target machine. + + /// getStackGrowthDirection - Return the direction the stack grows + /// + StackDirection getStackGrowthDirection() const { return StackDir; } + + /// getStackAlignment - This method returns the number of bytes to which the + /// stack pointer must be aligned on entry to a function. Typically, this + /// is the largest alignment for any data object in the target. + /// + unsigned getStackAlignment() const { return StackAlignment; } + + /// alignSPAdjust - This method aligns the stack adjustment to the correct + /// alignment.
+ /// + int alignSPAdjust(int SPAdj) const { + if (SPAdj < 0) { + SPAdj = -alignTo(-SPAdj, StackAlignment); + } else { + SPAdj = alignTo(SPAdj, StackAlignment); + } + return SPAdj; + } + + /// getTransientStackAlignment - This method returns the number of bytes to + /// which the stack pointer must be aligned at all times, even between + /// calls. + /// + unsigned getTransientStackAlignment() const { + return TransientStackAlignment; + } + + /// isStackRealignable - This method returns whether the stack can be + /// realigned. + bool isStackRealignable() const { + return StackRealignable; + } + + /// Return the skew that has to be applied to stack alignment under + /// certain conditions (e.g. stack was adjusted before function \p MF + /// was called). + virtual unsigned getStackAlignmentSkew(const MachineFunction &MF) const; + + /// getOffsetOfLocalArea - This method returns the offset of the local area + /// from the stack pointer on entrance to a function. + /// + int getOffsetOfLocalArea() const { return LocalAreaOffset; } + + /// isFPCloseToIncomingSP - Return true if the frame pointer is close to + /// the incoming stack pointer, false if it is close to the post-prologue + /// stack pointer. + virtual bool isFPCloseToIncomingSP() const { return true; } + + /// assignCalleeSavedSpillSlots - Allows target to override spill slot + /// assignment logic. If implemented, assignCalleeSavedSpillSlots() should + /// assign frame slots to all CSI entries and return true. If this method + /// returns false, spill slots will be assigned using generic implementation. + /// assignCalleeSavedSpillSlots() may add, delete or rearrange elements of + /// CSI. + virtual bool + assignCalleeSavedSpillSlots(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const { + return false; + } + + /// getCalleeSavedSpillSlots - This method returns a pointer to an array of + /// pairs that contains an entry for each callee saved register that must be + /// spilled to a particular stack location if it is spilled. + /// + /// Each entry in this array contains a <Reg, Offset> pair, indicating the + /// fixed offset from the incoming stack pointer that each register should be + /// spilled at. If a register is not listed here, the code generator is + /// allowed to spill it anywhere it chooses. + /// + virtual const SpillSlot * + getCalleeSavedSpillSlots(unsigned &NumEntries) const { + NumEntries = 0; + return nullptr; + } + + /// targetHandlesStackFrameRounding - Returns true if the target is + /// responsible for rounding up the stack frame (probably at emitPrologue + /// time). + virtual bool targetHandlesStackFrameRounding() const { + return false; + } + + /// Returns true if the target will correctly handle shrink wrapping. + virtual bool enableShrinkWrapping(const MachineFunction &MF) const { + return false; + } + + /// Returns true if the stack slot holes in the fixed and callee-save stack + /// area should be used when allocating other stack locations to reduce stack + /// size. + virtual bool enableStackSlotScavenging(const MachineFunction &MF) const { + return false; + } + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function.
+ virtual void emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const = 0; + virtual void emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const = 0; + + /// Replace a StackProbe stub (if any) with the actual probe code inline + virtual void inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologueMBB) const {} + + /// Adjust the prologue to have the function use segmented stacks. This works + /// by adding a check even before the "normal" function prologue. + virtual void adjustForSegmentedStacks(MachineFunction &MF, + MachineBasicBlock &PrologueMBB) const {} + + /// Adjust the prologue to add Erlang Run-Time System (ERTS) specific code in + /// the assembly prologue to explicitly handle the stack. + virtual void adjustForHiPEPrologue(MachineFunction &MF, + MachineBasicBlock &PrologueMBB) const {} + + /// spillCalleeSavedRegisters - Issues instruction(s) to spill all callee + /// saved registers and returns true if it isn't possible / profitable to do + /// so by issuing a series of store instructions via + /// storeRegToStackSlot(). Returns false otherwise. + virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + return false; + } + + /// restoreCalleeSavedRegisters - Issues instruction(s) to restore all callee + /// saved registers and returns true if it isn't possible / profitable to do + /// so by issuing a series of load instructions via loadRegToStackSlot(). + /// If it returns true, and any of the registers in CSI is not restored, + /// it sets the corresponding Restored flag in CSI to false. + /// Returns false otherwise. + virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + return false; + } + + /// Return true if the target needs to disable frame pointer elimination. + virtual bool noFramePointerElim(const MachineFunction &MF) const; + + /// hasFP - Return true if the specified function should have a dedicated + /// frame pointer register. For most targets this is true only if the function + /// has variable sized allocas or if frame pointer elimination is disabled. + virtual bool hasFP(const MachineFunction &MF) const = 0; + + /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is + /// not required, we reserve argument space for call sites in the function + /// immediately on entry to the current function. This eliminates the need for + /// add/sub sp brackets around call sites. Returns true if the call frame is + /// included as part of the stack frame. + virtual bool hasReservedCallFrame(const MachineFunction &MF) const { + return !hasFP(MF); + } + + /// canSimplifyCallFramePseudos - When possible, it's best to simplify the + /// call frame pseudo ops before doing frame index elimination. This is + /// possible only when frame index references between the pseudos won't + /// need adjusting for the call frame adjustments. Normally, that's true + /// if the function has a reserved call frame or a frame pointer. Some + /// targets (Thumb2, for example) may have more complicated criteria, + /// however, and can override this behavior. + virtual bool canSimplifyCallFramePseudos(const MachineFunction &MF) const { + return hasReservedCallFrame(MF) || hasFP(MF); + } + + // needsFrameIndexResolution - Do we need to perform FI resolution for + // this function.
Normally, this is required only when the function + // has any stack objects. However, targets may want to override this. + virtual bool needsFrameIndexResolution(const MachineFunction &MF) const; + + /// getFrameIndexReference - This method should return the base register + /// and offset used to reference a frame index location. The offset is + /// returned directly, and the base register is returned via FrameReg. + virtual int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const; + + /// Same as \c getFrameIndexReference, except that the stack pointer (as + /// opposed to the frame pointer) will be the preferred value for \p + /// FrameReg. This is generally used for emitting statepoint or EH tables that + /// use offsets from RSP. If \p IgnoreSPUpdates is true, the returned + /// offset is only guaranteed to be valid with respect to the value of SP at + /// the end of the prologue. + virtual int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, + unsigned &FrameReg, + bool IgnoreSPUpdates) const { + // Always safe to dispatch to getFrameIndexReference. + return getFrameIndexReference(MF, FI, FrameReg); + } + + /// This method determines which of the registers reported by + /// TargetRegisterInfo::getCalleeSavedRegs() should actually get saved. + /// The default implementation populates the \p SavedRegs bitset with + /// all registers which are modified in the function; targets may override + /// this function to save additional registers. + /// This method also sets up the register scavenger ensuring there is a free + /// register or a frameindex available. + virtual void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS = nullptr) const; + + /// processFunctionBeforeFrameFinalized - This method is called immediately + /// before the specified function's frame layout (MF.getFrameInfo()) is + /// finalized. Once the frame is finalized, MO_FrameIndex operands are + /// replaced with direct constants. This method is optional. + /// + virtual void processFunctionBeforeFrameFinalized(MachineFunction &MF, + RegScavenger *RS = nullptr) const { + } + + virtual unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const { + report_fatal_error("WinEH not implemented for this target"); + } + + /// This method is called during prolog/epilog code insertion to eliminate + /// call frame setup and destroy pseudo instructions (but only if the Target + /// is using them). It is responsible for eliminating these instructions, + /// replacing them with concrete instructions. This method need only be + /// implemented if using call frame setup/destroy pseudo instructions. + /// Returns an iterator pointing to the instruction after the replaced one. + virtual MachineBasicBlock::iterator + eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + llvm_unreachable("Call Frame Pseudo Instructions do not exist on this " + "target!"); + } + + + /// Order the symbols in the local stack frame. + /// The list of objects that we want to order is in \p objectsToAllocate as + /// indices into the MachineFrameInfo. The array can be reordered in any way + /// upon return. The contents of the array, however, may not be modified (i.e. + /// only their order may be changed). + /// By default, just maintain the original order.
+ virtual void + orderFrameObjects(const MachineFunction &MF, + SmallVectorImpl<int> &objectsToAllocate) const { + } + + /// Check whether or not the given \p MBB can be used as a prologue + /// for the target. + /// The prologue will be inserted first in this basic block. + /// This method is used by the shrink-wrapping pass to decide if + /// \p MBB will be correctly handled by the target. + /// As soon as the target enables shrink-wrapping without overriding + /// this method, we assume that each basic block is a valid + /// prologue. + virtual bool canUseAsPrologue(const MachineBasicBlock &MBB) const { + return true; + } + + /// Check whether or not the given \p MBB can be used as an epilogue + /// for the target. + /// The epilogue will be inserted before the first terminator of that block. + /// This method is used by the shrink-wrapping pass to decide if + /// \p MBB will be correctly handled by the target. + /// As soon as the target enables shrink-wrapping without overriding + /// this method, we assume that each basic block is a valid + /// epilogue. + virtual bool canUseAsEpilogue(const MachineBasicBlock &MBB) const { + return true; + } + + /// Check if given function is safe for not having callee saved registers. + /// This is used when interprocedural register allocation is enabled. + static bool isSafeForNoCSROpt(const Function *F) { + if (!F->hasLocalLinkage() || F->hasAddressTaken() || + !F->hasFnAttribute(Attribute::NoRecurse)) + return false; + // Function should not be optimized as tail call. + for (const User *U : F->users()) + if (auto CS = ImmutableCallSite(U)) + if (CS.isTailCall()) + return false; + return true; + } +}; + +} // End llvm namespace + +#endif diff --git a/include/llvm/Target/TargetFrameLowering.h b/include/llvm/Target/TargetFrameLowering.h deleted file mode 100644 index 31017cbc27b..00000000000 --- a/include/llvm/Target/TargetFrameLowering.h +++ /dev/null @@ -1,348 +0,0 @@ -//===-- llvm/Target/TargetFrameLowering.h ---------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Interface to describe the layout of a stack frame on the target machine. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_TARGETFRAMELOWERING_H -#define LLVM_TARGET_TARGETFRAMELOWERING_H - -#include "llvm/CodeGen/MachineBasicBlock.h" -#include <utility> -#include <vector> - -namespace llvm { - class BitVector; - class CalleeSavedInfo; - class MachineFunction; - class RegScavenger; - -/// Information about stack frame layout on the target. It holds the direction -/// of stack growth, the known stack alignment on entry to each function, and -/// the offset to the locals area. -/// -/// The offset to the local area is the offset from the stack pointer on -/// function entry to the first location where function data (local variables, -/// spill locations) can be stored. -class TargetFrameLowering { -public: - enum StackDirection { - StackGrowsUp, // Adding to the stack increases the stack address - StackGrowsDown // Adding to the stack decreases the stack address - }; - - // Maps a callee saved register to a stack slot with a fixed offset. - struct SpillSlot { - unsigned Reg; - int Offset; // Offset relative to stack pointer on function entry.
- }; -private: - StackDirection StackDir; - unsigned StackAlignment; - unsigned TransientStackAlignment; - int LocalAreaOffset; - bool StackRealignable; -public: - TargetFrameLowering(StackDirection D, unsigned StackAl, int LAO, - unsigned TransAl = 1, bool StackReal = true) - : StackDir(D), StackAlignment(StackAl), TransientStackAlignment(TransAl), - LocalAreaOffset(LAO), StackRealignable(StackReal) {} - - virtual ~TargetFrameLowering(); - - // These methods return information that describes the abstract stack layout - // of the target machine. - - /// getStackGrowthDirection - Return the direction the stack grows - /// - StackDirection getStackGrowthDirection() const { return StackDir; } - - /// getStackAlignment - This method returns the number of bytes to which the - /// stack pointer must be aligned on entry to a function. Typically, this - /// is the largest alignment for any data object in the target. - /// - unsigned getStackAlignment() const { return StackAlignment; } - - /// alignSPAdjust - This method aligns the stack adjustment to the correct - /// alignment. - /// - int alignSPAdjust(int SPAdj) const { - if (SPAdj < 0) { - SPAdj = -alignTo(-SPAdj, StackAlignment); - } else { - SPAdj = alignTo(SPAdj, StackAlignment); - } - return SPAdj; - } - - /// getTransientStackAlignment - This method returns the number of bytes to - /// which the stack pointer must be aligned at all times, even between - /// calls. - /// - unsigned getTransientStackAlignment() const { - return TransientStackAlignment; - } - - /// isStackRealignable - This method returns whether the stack can be - /// realigned. - bool isStackRealignable() const { - return StackRealignable; - } - - /// Return the skew that has to be applied to stack alignment under - /// certain conditions (e.g. stack was adjusted before function \p MF - /// was called). - virtual unsigned getStackAlignmentSkew(const MachineFunction &MF) const; - - /// getOffsetOfLocalArea - This method returns the offset of the local area - /// from the stack pointer on entrance to a function. - /// - int getOffsetOfLocalArea() const { return LocalAreaOffset; } - - /// isFPCloseToIncomingSP - Return true if the frame pointer is close to - /// the incoming stack pointer, false if it is close to the post-prologue - /// stack pointer. - virtual bool isFPCloseToIncomingSP() const { return true; } - - /// assignCalleeSavedSpillSlots - Allows target to override spill slot - /// assignment logic. If implemented, assignCalleeSavedSpillSlots() should - /// assign frame slots to all CSI entries and return true. If this method - /// returns false, spill slots will be assigned using generic implementation. - /// assignCalleeSavedSpillSlots() may add, delete or rearrange elements of - /// CSI. - virtual bool - assignCalleeSavedSpillSlots(MachineFunction &MF, - const TargetRegisterInfo *TRI, - std::vector<CalleeSavedInfo> &CSI) const { - return false; - } - - /// getCalleeSavedSpillSlots - This method returns a pointer to an array of - /// pairs that contains an entry for each callee saved register that must be - /// spilled to a particular stack location if it is spilled. - /// - /// Each entry in this array contains a <Reg, Offset> pair, indicating the - /// fixed offset from the incoming stack pointer that each register should be - /// spilled at. If a register is not listed here, the code generator is - /// allowed to spill it anywhere it chooses.
- /// - virtual const SpillSlot * - getCalleeSavedSpillSlots(unsigned &NumEntries) const { - NumEntries = 0; - return nullptr; - } - - /// targetHandlesStackFrameRounding - Returns true if the target is - /// responsible for rounding up the stack frame (probably at emitPrologue - /// time). - virtual bool targetHandlesStackFrameRounding() const { - return false; - } - - /// Returns true if the target will correctly handle shrink wrapping. - virtual bool enableShrinkWrapping(const MachineFunction &MF) const { - return false; - } - - /// Returns true if the stack slot holes in the fixed and callee-save stack - /// area should be used when allocating other stack locations to reduce stack - /// size. - virtual bool enableStackSlotScavenging(const MachineFunction &MF) const { - return false; - } - - /// emitProlog/emitEpilog - These methods insert prolog and epilog code into - /// the function. - virtual void emitPrologue(MachineFunction &MF, - MachineBasicBlock &MBB) const = 0; - virtual void emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const = 0; - - /// Replace a StackProbe stub (if any) with the actual probe code inline - virtual void inlineStackProbe(MachineFunction &MF, - MachineBasicBlock &PrologueMBB) const {} - - /// Adjust the prologue to have the function use segmented stacks. This works - /// by adding a check even before the "normal" function prologue. - virtual void adjustForSegmentedStacks(MachineFunction &MF, - MachineBasicBlock &PrologueMBB) const {} - - /// Adjust the prologue to add Erlang Run-Time System (ERTS) specific code in - /// the assembly prologue to explicitly handle the stack. - virtual void adjustForHiPEPrologue(MachineFunction &MF, - MachineBasicBlock &PrologueMBB) const {} - - /// spillCalleeSavedRegisters - Issues instruction(s) to spill all callee - /// saved registers and returns true if it isn't possible / profitable to do - /// so by issuing a series of store instructions via - /// storeRegToStackSlot(). Returns false otherwise. - virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { - return false; - } - - /// restoreCalleeSavedRegisters - Issues instruction(s) to restore all callee - /// saved registers and returns true if it isn't possible / profitable to do - /// so by issuing a series of load instructions via loadRegToStackSlot(). - /// If it returns true, and any of the registers in CSI is not restored, - /// it sets the corresponding Restored flag in CSI to false. - /// Returns false otherwise. - virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { - return false; - } - - /// Return true if the target needs to disable frame pointer elimination. - virtual bool noFramePointerElim(const MachineFunction &MF) const; - - /// hasFP - Return true if the specified function should have a dedicated - /// frame pointer register. For most targets this is true only if the function - /// has variable sized allocas or if frame pointer elimination is disabled. - virtual bool hasFP(const MachineFunction &MF) const = 0; - - /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is - /// not required, we reserve argument space for call sites in the function - /// immediately on entry to the current function. This eliminates the need for - /// add/sub sp brackets around call sites.
Returns true if the call frame is - /// included as part of the stack frame. - virtual bool hasReservedCallFrame(const MachineFunction &MF) const { - return !hasFP(MF); - } - - /// canSimplifyCallFramePseudos - When possible, it's best to simplify the - /// call frame pseudo ops before doing frame index elimination. This is - /// possible only when frame index references between the pseudos won't - /// need adjusting for the call frame adjustments. Normally, that's true - /// if the function has a reserved call frame or a frame pointer. Some - /// targets (Thumb2, for example) may have more complicated criteria, - /// however, and can override this behavior. - virtual bool canSimplifyCallFramePseudos(const MachineFunction &MF) const { - return hasReservedCallFrame(MF) || hasFP(MF); - } - - // needsFrameIndexResolution - Do we need to perform FI resolution for - // this function. Normally, this is required only when the function - // has any stack objects. However, targets may want to override this. - virtual bool needsFrameIndexResolution(const MachineFunction &MF) const; - - /// getFrameIndexReference - This method should return the base register - /// and offset used to reference a frame index location. The offset is - /// returned directly, and the base register is returned via FrameReg. - virtual int getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const; - - /// Same as \c getFrameIndexReference, except that the stack pointer (as - /// opposed to the frame pointer) will be the preferred value for \p - /// FrameReg. This is generally used for emitting statepoint or EH tables that - /// use offsets from RSP. If \p IgnoreSPUpdates is true, the returned - /// offset is only guaranteed to be valid with respect to the value of SP at - /// the end of the prologue. - virtual int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, - unsigned &FrameReg, - bool IgnoreSPUpdates) const { - // Always safe to dispatch to getFrameIndexReference. - return getFrameIndexReference(MF, FI, FrameReg); - } - - /// This method determines which of the registers reported by - /// TargetRegisterInfo::getCalleeSavedRegs() should actually get saved. - /// The default implementation populates the \p SavedRegs bitset with - /// all registers which are modified in the function; targets may override - /// this function to save additional registers. - /// This method also sets up the register scavenger ensuring there is a free - /// register or a frameindex available. - virtual void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, - RegScavenger *RS = nullptr) const; - - /// processFunctionBeforeFrameFinalized - This method is called immediately - /// before the specified function's frame layout (MF.getFrameInfo()) is - /// finalized. Once the frame is finalized, MO_FrameIndex operands are - /// replaced with direct constants. This method is optional. - /// - virtual void processFunctionBeforeFrameFinalized(MachineFunction &MF, - RegScavenger *RS = nullptr) const { - } - - virtual unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const { - report_fatal_error("WinEH not implemented for this target"); - } - - /// This method is called during prolog/epilog code insertion to eliminate - /// call frame setup and destroy pseudo instructions (but only if the Target - /// is using them). It is responsible for eliminating these instructions, - /// replacing them with concrete instructions.
This method need only be - /// implemented if using call frame setup/destroy pseudo instructions. - /// Returns an iterator pointing to the instruction after the replaced one. - virtual MachineBasicBlock::iterator - eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const { - llvm_unreachable("Call Frame Pseudo Instructions do not exist on this " - "target!"); - } - - - /// Order the symbols in the local stack frame. - /// The list of objects that we want to order is in \p objectsToAllocate as - /// indices into the MachineFrameInfo. The array can be reordered in any way - /// upon return. The contents of the array, however, may not be modified (i.e. - /// only their order may be changed). - /// By default, just maintain the original order. - virtual void - orderFrameObjects(const MachineFunction &MF, - SmallVectorImpl<int> &objectsToAllocate) const { - } - - /// Check whether or not the given \p MBB can be used as a prologue - /// for the target. - /// The prologue will be inserted first in this basic block. - /// This method is used by the shrink-wrapping pass to decide if - /// \p MBB will be correctly handled by the target. - /// As soon as the target enables shrink-wrapping without overriding - /// this method, we assume that each basic block is a valid - /// prologue. - virtual bool canUseAsPrologue(const MachineBasicBlock &MBB) const { - return true; - } - - /// Check whether or not the given \p MBB can be used as an epilogue - /// for the target. - /// The epilogue will be inserted before the first terminator of that block. - /// This method is used by the shrink-wrapping pass to decide if - /// \p MBB will be correctly handled by the target. - /// As soon as the target enables shrink-wrapping without overriding - /// this method, we assume that each basic block is a valid - /// epilogue. - virtual bool canUseAsEpilogue(const MachineBasicBlock &MBB) const { - return true; - } - - /// Check if given function is safe for not having callee saved registers. - /// This is used when interprocedural register allocation is enabled. - static bool isSafeForNoCSROpt(const Function *F) { - if (!F->hasLocalLinkage() || F->hasAddressTaken() || - !F->hasFnAttribute(Attribute::NoRecurse)) - return false; - // Function should not be optimized as tail call.
- for (const User *U : F->users()) - if (auto CS = ImmutableCallSite(U)) - if (CS.isTailCall()) - return false; - return true; - } -}; - -} // End llvm namespace - -#endif diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp index 8b1376ab363..973816d5635 100644 --- a/lib/CodeGen/AsmPrinter/ARMException.cpp +++ b/lib/CodeGen/AsmPrinter/ARMException.cpp @@ -29,7 +29,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index a35fcdaaf9a..3081e761586 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -100,7 +100,7 @@ #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 67bab8c7684..5aa3f4ae103 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -68,7 +68,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/SMLoc.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp index dd7f7931b06..1a6cb967992 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -31,7 +31,7 @@ #include "llvm/MC/MachineLocation.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 06b5b06c41b..603d0f7f470 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -36,7 +36,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Casting.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp index 5d485f21357..35ce1fec385 100644 --- a/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/lib/CodeGen/AsmPrinter/WinException.cpp @@ -33,7 +33,7 @@ #include "llvm/MC/MCWin64EH.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetLowering.h" #include 
"llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/CodeGen/FEntryInserter.cpp b/lib/CodeGen/FEntryInserter.cpp index 9781338f952..3b38b5966b6 100644 --- a/lib/CodeGen/FEntryInserter.cpp +++ b/lib/CodeGen/FEntryInserter.cpp @@ -17,7 +17,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" diff --git a/lib/CodeGen/GCRootLowering.cpp b/lib/CodeGen/GCRootLowering.cpp index 35246545ca9..9c0eea78777 100644 --- a/lib/CodeGen/GCRootLowering.cpp +++ b/lib/CodeGen/GCRootLowering.cpp @@ -24,7 +24,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp index 8e31ed0a015..45eb605c3c2 100644 --- a/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -54,7 +54,7 @@ #include "llvm/Support/LowLevelTypeImpl.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" diff --git a/lib/CodeGen/LiveDebugValues.cpp b/lib/CodeGen/LiveDebugValues.cpp index a45b1e39fee..bf6d5388937 100644 --- a/lib/CodeGen/LiveDebugValues.cpp +++ b/lib/CodeGen/LiveDebugValues.cpp @@ -46,7 +46,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp index 2eab0376da2..33ae476bf4a 100644 --- a/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -30,7 +30,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetOpcodes.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" diff --git a/lib/CodeGen/MachineFrameInfo.cpp b/lib/CodeGen/MachineFrameInfo.cpp index be8adf75fb7..ba38005a93f 100644 --- a/lib/CodeGen/MachineFrameInfo.cpp +++ b/lib/CodeGen/MachineFrameInfo.cpp @@ -18,7 +18,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index 250a10c7d07..570c410e1fe 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -58,7 +58,7 @@ #include 
"llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/CodeGen/PatchableFunction.cpp b/lib/CodeGen/PatchableFunction.cpp index 513e8271656..b0424e70a47 100644 --- a/lib/CodeGen/PatchableFunction.cpp +++ b/lib/CodeGen/PatchableFunction.cpp @@ -16,7 +16,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index d611c9b45c5..92a2566f0c1 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -55,7 +55,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOpcodes.h" diff --git a/lib/CodeGen/RegUsageInfoCollector.cpp b/lib/CodeGen/RegUsageInfoCollector.cpp index 214c6d2c820..3aaa5a4738d 100644 --- a/lib/CodeGen/RegUsageInfoCollector.cpp +++ b/lib/CodeGen/RegUsageInfoCollector.cpp @@ -27,7 +27,7 @@ #include "llvm/CodeGen/RegisterUsageInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" using namespace llvm; diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp index 956dec39fc3..8e463ff272d 100644 --- a/lib/CodeGen/RegisterClassInfo.cpp +++ b/lib/CodeGen/RegisterClassInfo.cpp @@ -24,7 +24,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index 844ddb9ed3f..32194e6d76f 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -33,7 +33,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index b736037d71d..283ef1efd46 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -32,7 +32,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index ff49134f7b9..356f2585046 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -40,7 +40,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ccc06fa3ee1..c8abe25b7c6 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -98,7 +98,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLowering.h" diff --git a/lib/CodeGen/ShrinkWrap.cpp b/lib/CodeGen/ShrinkWrap.cpp index 5fb6afee88a..1c6fb1ce785 100644 --- a/lib/CodeGen/ShrinkWrap.cpp +++ b/lib/CodeGen/ShrinkWrap.cpp @@ -73,7 +73,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/CodeGen/TargetFrameLoweringImpl.cpp b/lib/CodeGen/TargetFrameLoweringImpl.cpp index 9dd98b4020d..64962a5b796 100644 --- a/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -20,7 +20,7 @@ #include "llvm/IR/Function.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Compiler.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp index bac12efd639..3010ab23828 100644 --- a/lib/CodeGen/TargetInstrInfo.cpp +++ b/lib/CodeGen/TargetInstrInfo.cpp @@ -26,7 +26,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/CodeGen/TargetOptionsImpl.cpp b/lib/CodeGen/TargetOptionsImpl.cpp index ed845e1706f..99ff4931e2f 100644 --- a/lib/CodeGen/TargetOptionsImpl.cpp +++ b/lib/CodeGen/TargetOptionsImpl.cpp @@ -15,7 +15,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp index 55318237e95..758fdabf5dd 100644 --- a/lib/CodeGen/TargetRegisterInfo.cpp +++ b/lib/CodeGen/TargetRegisterInfo.cpp @@ -27,7 +27,7 @@ 
#include "llvm/Support/MathExtras.h" #include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index c351efb0c39..55a256867fa 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 91b1481f5ef..1059bc37c8f 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -26,7 +26,7 @@ #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/IR/Function.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 2329fffd521..91fe921bfee 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -15,7 +15,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 9fc9592bdc5..83122281d2b 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -23,7 +23,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/MDBuilder.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include using namespace llvm; diff --git a/lib/Target/ARC/ARCFrameLowering.h b/lib/Target/ARC/ARCFrameLowering.h index ac5378adbd8..c042bec016c 100644 --- a/lib/Target/ARC/ARCFrameLowering.h +++ b/lib/Target/ARC/ARCFrameLowering.h @@ -17,7 +17,7 @@ #include "ARC.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/ARC/ARCRegisterInfo.cpp b/lib/Target/ARC/ARCRegisterInfo.cpp index 66f95911d3e..bed47a0eab5 100644 --- a/lib/Target/ARC/ARCRegisterInfo.cpp +++ b/lib/Target/ARC/ARCRegisterInfo.cpp @@ -25,7 +25,7 @@ #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/IR/Function.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h index 2c10031e3f8..1f18e2bf80c 100644 --- a/lib/Target/ARM/ARMFrameLowering.h +++ b/lib/Target/ARM/ARMFrameLowering.h @@ -11,7 +11,7 @@ #define LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/Target/TargetFrameLowering.h" +#include 
"llvm/CodeGen/TargetFrameLowering.h" #include namespace llvm { diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 4aa7e150342..2b4cdb7d97c 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -53,7 +53,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/Target/ARM/ThumbRegisterInfo.cpp b/lib/Target/ARM/ThumbRegisterInfo.cpp index 15a56752333..d2bebb9eeec 100644 --- a/lib/Target/ARM/ThumbRegisterInfo.cpp +++ b/lib/Target/ARM/ThumbRegisterInfo.cpp @@ -29,7 +29,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { diff --git a/lib/Target/AVR/AVRFrameLowering.h b/lib/Target/AVR/AVRFrameLowering.h index 30ef441183a..a0ba6c95127 100644 --- a/lib/Target/AVR/AVRFrameLowering.h +++ b/lib/Target/AVR/AVRFrameLowering.h @@ -10,7 +10,7 @@ #ifndef LLVM_AVR_FRAME_LOWERING_H #define LLVM_AVR_FRAME_LOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp index 7099b29a8bc..b6ac93452cb 100644 --- a/lib/Target/AVR/AVRRegisterInfo.cpp +++ b/lib/Target/AVR/AVRRegisterInfo.cpp @@ -18,7 +18,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/IR/Function.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "AVR.h" #include "AVRInstrInfo.h" diff --git a/lib/Target/BPF/BPFFrameLowering.h b/lib/Target/BPF/BPFFrameLowering.h index 5db963f518b..b4ffa0713fa 100644 --- a/lib/Target/BPF/BPFFrameLowering.h +++ b/lib/Target/BPF/BPFFrameLowering.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_BPF_BPFFRAMELOWERING_H #define LLVM_LIB_TARGET_BPF_BPFFRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class BPFSubtarget; diff --git a/lib/Target/BPF/BPFRegisterInfo.cpp b/lib/Target/BPF/BPFRegisterInfo.cpp index 273843e9270..cef905170f4 100644 --- a/lib/Target/BPF/BPFRegisterInfo.cpp +++ b/lib/Target/BPF/BPFRegisterInfo.cpp @@ -20,7 +20,7 @@ #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #define GET_REGINFO_TARGET_DESC diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h index 296edbe1eff..988718860c5 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/lib/Target/Hexagon/HexagonFrameLowering.h @@ -15,7 +15,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include namespace llvm { diff --git a/lib/Target/Lanai/LanaiFrameLowering.h b/lib/Target/Lanai/LanaiFrameLowering.h index 
2f9b6c3c158..ca690d513fc 100644 --- a/lib/Target/Lanai/LanaiFrameLowering.h +++ b/lib/Target/Lanai/LanaiFrameLowering.h @@ -15,7 +15,7 @@ #define LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H #include "Lanai.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/Lanai/LanaiRegisterInfo.cpp b/lib/Target/Lanai/LanaiRegisterInfo.cpp index 6ea477dce3e..7d444a46d0f 100644 --- a/lib/Target/Lanai/LanaiRegisterInfo.cpp +++ b/lib/Target/Lanai/LanaiRegisterInfo.cpp @@ -23,7 +23,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #define GET_REGINFO_TARGET_DESC diff --git a/lib/Target/Lanai/LanaiSubtarget.h b/lib/Target/Lanai/LanaiSubtarget.h index 2732ef3097e..313d950e8aa 100644 --- a/lib/Target/Lanai/LanaiSubtarget.h +++ b/lib/Target/Lanai/LanaiSubtarget.h @@ -19,7 +19,7 @@ #include "LanaiInstrInfo.h" #include "LanaiSelectionDAGInfo.h" #include "llvm/IR/DataLayout.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetSubtargetInfo.h" diff --git a/lib/Target/Lanai/LanaiTargetMachine.h b/lib/Target/Lanai/LanaiTargetMachine.h index ce1271d9dea..2fb1a053610 100644 --- a/lib/Target/Lanai/LanaiTargetMachine.h +++ b/lib/Target/Lanai/LanaiTargetMachine.h @@ -19,7 +19,7 @@ #include "LanaiInstrInfo.h" #include "LanaiSelectionDAGInfo.h" #include "LanaiSubtarget.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h index fdc4aa52a19..8807101f37c 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.h +++ b/lib/Target/MSP430/MSP430FrameLowering.h @@ -15,7 +15,7 @@ #define LLVM_LIB_TARGET_MSP430_MSP430FRAMELOWERING_H #include "MSP430.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class MSP430FrameLowering : public TargetFrameLowering { diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h index 97b5e810a1d..4935b80cfdd 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.h +++ b/lib/Target/MSP430/MSP430TargetMachine.h @@ -16,7 +16,7 @@ #define LLVM_LIB_TARGET_MSP430_MSP430TARGETMACHINE_H #include "MSP430Subtarget.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp index 76bca3df2bc..cb59e2ddb1c 100644 --- a/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/lib/Target/Mips/Mips16FrameLowering.cpp @@ -30,7 +30,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MachineLocation.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include #include #include diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp index 44771cbe8be..0ee0d73dc0a 100644 --- a/lib/Target/Mips/Mips16RegisterInfo.cpp +++ b/lib/Target/Mips/Mips16RegisterInfo.cpp @@ -29,7 +29,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include 
"llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h index 8c4214c4c21..883c3267d51 100644 --- a/lib/Target/Mips/MipsFrameLowering.h +++ b/lib/Target/Mips/MipsFrameLowering.h @@ -15,7 +15,7 @@ #define LLVM_LIB_TARGET_MIPS_MIPSFRAMELOWERING_H #include "Mips.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class MipsSubtarget; diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 38b3c3fb160..22a5a80a75c 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -62,7 +62,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index 9c64a0ecbb1..ec966afee0e 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -28,7 +28,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp index 86bd24166bb..bd65cbf74af 100644 --- a/lib/Target/Mips/MipsSERegisterInfo.cpp +++ b/lib/Target/Mips/MipsSERegisterInfo.cpp @@ -30,7 +30,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h index 320ca9a2f09..a802cf85d2e 100644 --- a/lib/Target/NVPTX/NVPTXFrameLowering.h +++ b/lib/Target/NVPTX/NVPTXFrameLowering.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H #define LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class NVPTXSubtarget; diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp index 88288abe64f..3957d426653 100644 --- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp +++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp @@ -20,7 +20,7 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h index 7674135f0a7..54a72a688ee 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -17,7 +17,7 @@ 
#include "ManagedStringPool.h" #include "NVPTXSubtarget.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { diff --git a/lib/Target/Nios2/Nios2FrameLowering.h b/lib/Target/Nios2/Nios2FrameLowering.h index 2aaea678d9e..2d9e84b2c72 100644 --- a/lib/Target/Nios2/Nios2FrameLowering.h +++ b/lib/Target/Nios2/Nios2FrameLowering.h @@ -14,7 +14,7 @@ #define LLVM_LIB_TARGET_NIOS2_NIOS2FRAMELOWERING_H #include "Nios2.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class Nios2Subtarget; diff --git a/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/lib/Target/PowerPC/PPCBranchCoalescing.cpp index 33085a42361..1ba82042e6e 100644 --- a/lib/Target/PowerPC/PPCBranchCoalescing.cpp +++ b/lib/Target/PowerPC/PPCBranchCoalescing.cpp @@ -22,7 +22,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h index fa813db5fef..f845d5a9ac6 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.h +++ b/lib/Target/PowerPC/PPCFrameLowering.h @@ -15,7 +15,7 @@ #include "PPC.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index d46c1383297..e476ca0494d 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -37,7 +37,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/Target/RISCV/RISCVFrameLowering.h b/lib/Target/RISCV/RISCVFrameLowering.h index 14772ddac4a..71f85864a39 100644 --- a/lib/Target/RISCV/RISCVFrameLowering.h +++ b/lib/Target/RISCV/RISCVFrameLowering.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_RISCV_RISCVFRAMELOWERING_H #define LLVM_LIB_TARGET_RISCV_RISCVFRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class RISCVSubtarget; diff --git a/lib/Target/RISCV/RISCVRegisterInfo.cpp b/lib/Target/RISCV/RISCVRegisterInfo.cpp index 4f6c528061c..740b206b802 100644 --- a/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -19,7 +19,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #define GET_REGINFO_TARGET_DESC diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h index ac0e69ccde1..6098afa6898 100644 --- a/lib/Target/Sparc/SparcFrameLowering.h +++ b/lib/Target/Sparc/SparcFrameLowering.h @@ -15,7 +15,7 @@ #define LLVM_LIB_TARGET_SPARC_SPARCFRAMELOWERING_H #include "Sparc.h" 
-#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h index bfbdb8d0b44..ad6b55a9fc9 100644 --- a/lib/Target/Sparc/SparcSubtarget.h +++ b/lib/Target/Sparc/SparcSubtarget.h @@ -19,7 +19,7 @@ #include "SparcInstrInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/IR/DataLayout.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" #include diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h index 91c5a5d53a1..a75d111b029 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/lib/Target/SystemZ/SystemZFrameLowering.h @@ -11,7 +11,7 @@ #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZFRAMELOWERING_H #include "llvm/ADT/IndexedMap.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class SystemZTargetMachine; diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp index 05f93ce5162..a44fae523fe 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -13,7 +13,7 @@ #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" using namespace llvm; diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h index bf326fce88f..4cc7f5ae058 100644 --- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h +++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h @@ -16,7 +16,7 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYFRAMELOWERING_H #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYFRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class MachineFrameInfo; diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp index 9367464c806..5e7ebd19fac 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp @@ -24,7 +24,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 38ac96e16d4..909319fc18f 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H #define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 1f49650340e..a9ea94337b9 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -33,7 +33,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include 
"llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h index 27584f4e2b6..e98e9cda11d 100644 --- a/lib/Target/XCore/XCoreFrameLowering.h +++ b/lib/Target/XCore/XCoreFrameLowering.h @@ -15,7 +15,7 @@ #ifndef LLVM_LIB_TARGET_XCORE_XCOREFRAMELOWERING_H #define LLVM_LIB_TARGET_XCORE_XCOREFRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp index d34e928b14f..a6cf6837009 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.cpp +++ b/lib/Target/XCore/XCoreRegisterInfo.cpp @@ -30,7 +30,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/unittests/CodeGen/MachineInstrTest.cpp b/unittests/CodeGen/MachineInstrTest.cpp index 89041e2ab22..ac2fffe8502 100644 --- a/unittests/CodeGen/MachineInstrTest.cpp +++ b/unittests/CodeGen/MachineInstrTest.cpp @@ -12,7 +12,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" -- cgit v1.2.3 From be2858c001c1456d7cbaee597866acdcdc74a229 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Nov 2017 22:48:13 +0000 Subject: [X86] Give unary PERMI priority over SHUF128 in lowerV8I64VectorShuffle to make it possible to fold a load. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317382 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 8 ++++---- test/CodeGen/X86/vector-shuffle-512-v8.ll | 21 +++++++++++++++++++-- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index d65a65e365c..ea97dc2dccd 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -13709,10 +13709,6 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - if (SDValue Shuf128 = - lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) - return Shuf128; - if (V2.isUndef()) { // When the shuffle is mirrored between the 128-bit lanes of the unit, we // can use lower latency instructions that will operate on all four @@ -13734,6 +13730,10 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG)); } + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) + return Shuf128; + // Try to use shift instructions. 
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll index 6c980559721..1d17ef109d2 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -1165,14 +1165,31 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_01014545: ; AVX512F: # BB#0: -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_01014545: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512F-32-NEXT: retl + + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_01014545_mem(<8 x i64>* %ptr, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_01014545_mem: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5] +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_01014545_mem: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5] ; AVX512F-32-NEXT: retl + %a = load <8 x i64>, <8 x i64>* %ptr %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5> ret <8 x i64> %shuffle } -- cgit v1.2.3 From 1024a3777d981698e8a60c5b86c21ea5b05c5835 Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Fri, 3 Nov 2017 23:03:38 +0000 Subject: [CallSiteSplitting] Silence GCC's -Wparentheses. NFCI. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317385 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/CallSiteSplitting.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp index 2224cb2eb62..5e6bfc73ca5 100644 --- a/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -126,8 +126,8 @@ static bool createCallSitesOnOrPredicatedArgument( Instruction *&CallUntakenFromHeader = IsCSInTakenPath ? NewCSTakenFromNextCond : NewCSTakenFromHeader; - assert(Pred == ICmpInst::ICMP_EQ || - Pred == ICmpInst::ICMP_NE && + assert((Pred == ICmpInst::ICMP_EQ || + Pred == ICmpInst::ICMP_NE) && "Unexpected predicate in an OR condition"); // Set the constant value for agruments in the call predicated based on -- cgit v1.2.3 From f0732934fcd3a3abd9af1344c71de28778c1a9a2 Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Sat, 4 Nov 2017 00:44:01 +0000 Subject: [CallSiteSplitting] clang-format my last commit. NFCI. Thanks to Rui for pointing it out. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317393 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/CallSiteSplitting.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp index 5e6bfc73ca5..b70ed8d7d4c 100644 --- a/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -126,9 +126,8 @@ static bool createCallSitesOnOrPredicatedArgument( Instruction *&CallUntakenFromHeader = IsCSInTakenPath ?
NewCSTakenFromNextCond : NewCSTakenFromHeader; - assert((Pred == ICmpInst::ICMP_EQ || - Pred == ICmpInst::ICMP_NE) && - "Unexpected predicate in an OR condition"); + assert((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) && + "Unexpected predicate in an OR condition"); // Set the constant value for agruments in the call predicated based on // the OR condition. -- cgit v1.2.3 From f1b2e0b26a4eac07e92c73c5aeaac14f83724198 Mon Sep 17 00:00:00 2001 From: Sean Fertile Date: Sat, 4 Nov 2017 01:54:20 +0000 Subject: Revert "[LTO][ThinLTO] Use the linker resolutions to mark global values ..." Changes more tests than expected on one of the build bots. Reverting to investigate. This reverts https://llvm.org/svn/llvm-project/llvm/trunk@317374 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317395 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/ModuleSummaryIndex.h | 12 ++------- include/llvm/IR/ModuleSummaryIndexYAML.h | 8 +++--- lib/Analysis/ModuleSummaryAnalysis.cpp | 9 +++---- lib/Bitcode/Reader/BitcodeReader.cpp | 4 +-- lib/Bitcode/Writer/BitcodeWriter.cpp | 2 -- lib/LTO/LTO.cpp | 21 ++++----------- lib/Transforms/Utils/FunctionImportUtils.cpp | 17 ------------ test/Bitcode/thinlto-summary-local-5.0.ll | 22 --------------- test/Bitcode/thinlto-summary-local-5.0.ll.bc | Bin 1028 -> 0 bytes test/LTO/Resolution/X86/comdat-mixed-lto.ll | 2 +- test/LTO/Resolution/X86/comdat.ll | 4 +-- test/LTO/Resolution/X86/commons.ll | 2 +- test/ThinLTO/X86/deadstrip.ll | 30 ++++++++------------- test/ThinLTO/X86/funcimport2.ll | 4 +-- test/ThinLTO/X86/internalize.ll | 9 +++---- test/ThinLTO/X86/reference_non_importable.ll | 2 +- test/Transforms/LowerTypeTests/import-unsat.ll | 1 - .../PGOProfile/thinlto_samplepgo_icp2.ll | 2 +- test/Transforms/WholeProgramDevirt/import-indir.ll | 1 - 19 files changed, 37 insertions(+), 115 deletions(-) delete mode 100644 test/Bitcode/thinlto-summary-local-5.0.ll delete mode 100644 test/Bitcode/thinlto-summary-local-5.0.ll.bc diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h index b1e58a2a0d9..2d664f41e3c 100644 --- a/include/llvm/IR/ModuleSummaryIndex.h +++ b/include/llvm/IR/ModuleSummaryIndex.h @@ -148,15 +148,11 @@ public: /// In combined summary, indicate that the global value is live. unsigned Live : 1; - /// Indicates that the linker resolved the symbol to a definition from - /// within the same linkage unit. - unsigned DSOLocal : 1; - /// Convenience Constructors explicit GVFlags(GlobalValue::LinkageTypes Linkage, - bool NotEligibleToImport, bool Live, bool IsLocal) + bool NotEligibleToImport, bool Live) : Linkage(Linkage), NotEligibleToImport(NotEligibleToImport), - Live(Live), DSOLocal(IsLocal) {} + Live(Live) {} }; private: @@ -233,10 +229,6 @@ public: void setLive(bool Live) { Flags.Live = Live; } - void setDSOLocal(bool Local) { Flags.DSOLocal = Local; } - - bool isDSOLocal() const { return Flags.DSOLocal; } - /// Flag that this global value cannot be imported.
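 /// (A usage sketch for context, mirroring the FunctionImportUtils code this
 /// revert removes further below; illustrative only:
 ///   if (Summary->isDSOLocal())
 ///     GV.setDSOLocal(true);
 /// after the revert, the DSOLocal bit and both accessors are gone.)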
void setNotEligibleToImport() { Flags.NotEligibleToImport = true; } diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h index 4687f2d53e7..2f9990ca03d 100644 --- a/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -135,7 +135,7 @@ template <> struct MappingTraits { struct FunctionSummaryYaml { unsigned Linkage; - bool NotEligibleToImport, Live, IsLocal; + bool NotEligibleToImport, Live; std::vector<uint64_t> TypeTests; std::vector<FunctionSummary::VFuncId> TypeTestAssumeVCalls, TypeCheckedLoadVCalls; @@ -177,7 +177,6 @@ template <> struct MappingTraits<FunctionSummaryYaml> { io.mapOptional("Linkage", summary.Linkage); io.mapOptional("NotEligibleToImport", summary.NotEligibleToImport); io.mapOptional("Live", summary.Live); - io.mapOptional("Local", summary.IsLocal); io.mapOptional("TypeTests", summary.TypeTests); io.mapOptional("TypeTestAssumeVCalls", summary.TypeTestAssumeVCalls); io.mapOptional("TypeCheckedLoadVCalls", summary.TypeCheckedLoadVCalls); @@ -212,7 +211,7 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> { Elem.SummaryList.push_back(llvm::make_unique<FunctionSummary>( GlobalValueSummary::GVFlags( static_cast<GlobalValue::LinkageTypes>(FSum.Linkage), - FSum.NotEligibleToImport, FSum.Live, FSum.IsLocal), + FSum.NotEligibleToImport, FSum.Live), 0, FunctionSummary::FFlags{}, ArrayRef<ValueInfo>{}, ArrayRef<FunctionSummary::EdgeTy>{}, std::move(FSum.TypeTests), std::move(FSum.TypeTestAssumeVCalls), @@ -229,8 +228,7 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> { FSums.push_back(FunctionSummaryYaml{ FSum->flags().Linkage, static_cast<bool>(FSum->flags().NotEligibleToImport), - static_cast<bool>(FSum->flags().Live), - static_cast<bool>(FSum->flags().DSOLocal), FSum->type_tests(), + static_cast<bool>(FSum->flags().Live), FSum->type_tests(), FSum->type_test_assume_vcalls(), FSum->type_checked_load_vcalls(), FSum->type_test_assume_const_vcalls(), FSum->type_checked_load_const_vcalls()}); diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp index 82db09ca97b..afd575e7273 100644 --- a/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -303,7 +303,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, // FIXME: refactor this to use the same code that inliner is using.
F.isVarArg(); GlobalValueSummary::GVFlags Flags(F.getLinkage(), NotEligibleForImport, - /* Live = */ false, F.isDSOLocal()); + /* Live = */ false); FunctionSummary::FFlags FunFlags{ F.hasFnAttribute(Attribute::ReadNone), F.hasFnAttribute(Attribute::ReadOnly), @@ -329,7 +329,7 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V, findRefEdges(Index, &V, RefEdges, Visited); bool NonRenamableLocal = isNonRenamableLocal(V); GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal, - /* Live = */ false, V.isDSOLocal()); + /* Live = */ false); auto GVarSummary = llvm::make_unique<GlobalVarSummary>(Flags, RefEdges.takeVector()); if (NonRenamableLocal) @@ -342,7 +342,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A, DenseSet<GlobalValue::GUID> &CantBePromoted) { bool NonRenamableLocal = isNonRenamableLocal(A); GlobalValueSummary::GVFlags Flags(A.getLinkage(), NonRenamableLocal, - /* Live = */ false, A.isDSOLocal()); + /* Live = */ false); auto AS = llvm::make_unique<AliasSummary>(Flags); auto *Aliasee = A.getBaseObject(); auto *AliaseeSummary = Index.getGlobalValueSummary(*Aliasee); @@ -410,8 +410,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( assert(GV->isDeclaration() && "Def in module asm already has definition"); GlobalValueSummary::GVFlags GVFlags(GlobalValue::InternalLinkage, /* NotEligibleToImport = */ true, - /* Live = */ true, - /* Local */ GV->isDSOLocal()); + /* Live = */ true); CantBePromoted.insert(GlobalValue::getGUID(Name)); // Create the appropriate summary type. if (Function *F = dyn_cast<Function>(GV)) { diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index d0f11db8f61..c2272260f44 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -889,9 +889,7 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags, // to work correctly on earlier versions, we must conservatively treat all // values as live. bool Live = (RawFlags & 0x2) || Version < 3; - bool Local = (RawFlags & 0x4); - - return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live, Local); + return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live); } static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) { diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index c5d376c9426..1e491aa066e 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -955,8 +955,6 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) { RawFlags |= Flags.NotEligibleToImport; // bool RawFlags |= (Flags.Live << 1); - RawFlags |= (Flags.DSOLocal << 2); - // Linkage don't need to be remapped at that time for the summary. Any future // change to the getEncodedLinkage() function will need to be taken into // account here as well. diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index 9c737795b5a..017dd201f9c 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -630,9 +630,6 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, NonPrevailingComdats.insert(GV->getComdat()); cast<GlobalObject>(GV)->setComdat(nullptr); } - - // Set the 'local' flag based on the linker resolution for this symbol. - GV->setDSOLocal(Res.FinalDefinitionInLinkageUnit); } // Common resolution: collect the maximum size/alignment over all commons.
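 // (For example, with a hypothetical "char buf[4];" in one module and
 // "char buf[8];" in another, both with common linkage, the merged module
 // must keep the 8-byte instance with the larger alignment -- hence the
 // maximum collected here.)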
// We also record if we see an instance of a common as prevailing, so that @@ -646,6 +643,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, CommonRes.Prevailing |= Res.Prevailing; } + // FIXME: use proposed local attribute for FinalDefinitionInLinkageUnit. } if (!M.getComdatSymbolTable().empty()) for (GlobalValue &GV : M.global_values()) @@ -700,10 +698,10 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, assert(ResI != ResE); SymbolResolution Res = *ResI++; - if (!Sym.getIRName().empty()) { - auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier( - Sym.getIRName(), GlobalValue::ExternalLinkage, "")); - if (Res.Prevailing) { + if (Res.Prevailing) { + if (!Sym.getIRName().empty()) { + auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier( + Sym.getIRName(), GlobalValue::ExternalLinkage, "")); ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier(); // For linker redefined symbols (via --wrap or --defsym) we want to @@ -715,15 +713,6 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, GUID, BM.getModuleIdentifier())) S->setLinkage(GlobalValue::WeakAnyLinkage); } - - // If the linker resolved the symbol to a local definition then mark it - // as local in the summary for the module we are adding. - if (Res.FinalDefinitionInLinkageUnit) { - if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( - GUID, BM.getModuleIdentifier())) { - S->setDSOLocal(true); - } - } } } diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp index 2e6fc4e8482..fbb61ac1ae9 100644 --- a/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -203,23 +203,6 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV, } void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { - - // Check the summaries to see if the symbol gets resolved to a known local - // definition. - if (GV.hasName()) { - ValueInfo VI = ImportIndex.getValueInfo(GV.getGUID()); - if (VI) { - // Need to check all summaries are local in case of hash collisions. - bool IsLocal = VI.getSummaryList().size() && - llvm::all_of(VI.getSummaryList(), - [](const std::unique_ptr<GlobalValueSummary> &Summary) { - return Summary->isDSOLocal(); - }); - if (IsLocal) - GV.setDSOLocal(true); - } - } - bool DoPromote = false; if (GV.hasLocalLinkage() && ((DoPromote = shouldPromoteLocalToGlobal(&GV)) || isPerformingImport())) { diff --git a/test/Bitcode/thinlto-summary-local-5.0.ll b/test/Bitcode/thinlto-summary-local-5.0.ll deleted file mode 100644 index cbc48d23df3..00000000000 --- a/test/Bitcode/thinlto-summary-local-5.0.ll +++ /dev/null @@ -1,22 +0,0 @@ -; Bitcode compatibility test for dso_local flag in thin-lto summaries. -; Checks that older bitcode summaries without the dso_local op are still -; properly parsed and don't set GlobalValues as dso_local.
- -; RUN: llvm-dis < %s.bc | FileCheck %s -; RUN: llvm-bcanalyzer -dump %s.bc | FileCheck %s --check-prefix=BCAN - -define void @foo() { -;CHECK-DAG:define void @foo() - ret void -} - -@bar = global i32 0 -;CHECK-DAG: @bar = global i32 0 - -@baz = alias i32, i32* @bar -;CHECK-DAG: @bar = global i32 0 - -;BCAN: -;BCAN-NEXT: -;BCAN-NEXT: diff --git a/test/Bitcode/thinlto-summary-local-5.0.ll.bc b/test/Bitcode/thinlto-summary-local-5.0.ll.bc deleted file mode 100644 index 8dc7ca0a74b..00000000000 Binary files a/test/Bitcode/thinlto-summary-local-5.0.ll.bc and /dev/null differ diff --git a/test/LTO/Resolution/X86/comdat-mixed-lto.ll b/test/LTO/Resolution/X86/comdat-mixed-lto.ll index d6022c64351..f6ee22e4161 100644 --- a/test/LTO/Resolution/X86/comdat-mixed-lto.ll +++ b/test/LTO/Resolution/X86/comdat-mixed-lto.ll @@ -17,7 +17,7 @@ ; would clash with the copy from this module. ; RUN: llvm-dis %t3.0.0.preopt.bc -o - | FileCheck %s ; CHECK: define internal void @__cxx_global_var_init() section ".text.startup" { -; CHECK: define available_externally dso_local void @testglobfunc() section ".text.startup" { +; CHECK: define available_externally void @testglobfunc() section ".text.startup" { ; ModuleID = 'comdat-mixed-lto.o' source_filename = "comdat-mixed-lto.cpp" diff --git a/test/LTO/Resolution/X86/comdat.ll b/test/LTO/Resolution/X86/comdat.ll index 94f28384231..60d082b3e0f 100644 --- a/test/LTO/Resolution/X86/comdat.ll +++ b/test/LTO/Resolution/X86/comdat.ll @@ -70,14 +70,14 @@ bb11: ; CHECK-DAG: @a23 = alias i32 (i8*), i32 (i8*)* @f1.2{{$}} ; CHECK-DAG: @a24 = alias i16, bitcast (i32 (i8*)* @f1.2 to i16*) -; CHECK: define weak_odr dso_local i32 @f1(i8*) comdat($c1) { +; CHECK: define weak_odr i32 @f1(i8*) comdat($c1) { ; CHECK-NEXT: bb10: ; CHECK-NEXT: br label %bb11{{$}} ; CHECK: bb11: ; CHECK-NEXT: ret i32 42 ; CHECK-NEXT: } -; CHECK: define internal dso_local i32 @f1.2(i8* %this) comdat($c2) { +; CHECK: define internal i32 @f1.2(i8* %this) comdat($c2) { ; CHECK-NEXT: bb20: ; CHECK-NEXT: store i8* %this, i8** null ; CHECK-NEXT: br label %bb21 diff --git a/test/LTO/Resolution/X86/commons.ll b/test/LTO/Resolution/X86/commons.ll index 8adfb87d6ed..28bf1ada4a8 100644 --- a/test/LTO/Resolution/X86/commons.ll +++ b/test/LTO/Resolution/X86/commons.ll @@ -4,7 +4,7 @@ ; RUN: llvm-dis -o - %t.out.0.0.preopt.bc | FileCheck %s ; A strong definition should override the common -; CHECK: @x = dso_local global i32 42, align 4 +; CHECK: @x = global i32 42, align 4 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/ThinLTO/X86/deadstrip.ll b/test/ThinLTO/X86/deadstrip.ll index 90de3bb9a32..c19ccb01be3 100644 --- a/test/ThinLTO/X86/deadstrip.ll +++ b/test/ThinLTO/X86/deadstrip.ll @@ -18,8 +18,8 @@ ; RUN: -r %t2.bc,_boo,pl \ ; RUN: -r %t2.bc,_dead_func,pl \ ; RUN: -r %t2.bc,_another_dead_func,pl -; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s --check-prefix=LTO2 -; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=LTO2-CHECK2 +; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s +; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=CHECK2 ; RUN: llvm-nm %t.out.1 | FileCheck %s --check-prefix=CHECK2-NM ; RUN: llvm-bcanalyzer -dump %t.out.index.bc | FileCheck %s --check-prefix=COMBINED @@ -27,14 +27,14 @@ ; COMBINED-DAG: Date: Sat, 4 Nov 2017 06:00:11 +0000 Subject: llvm/test/Object/archive-SYM64-write.test: Delete large temp files. They are 8GiB total. 
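lit does not remove %t outputs on its own, so the fix is to delete them
explicitly once the final RUN line has consumed them, following the usual
pattern:

# RUN: rm -f %t %t2 %t.lib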
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317401 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Object/archive-SYM64-write.test | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/Object/archive-SYM64-write.test b/test/Object/archive-SYM64-write.test index d03b54c58b3..161d6cb8191 100644 --- a/test/Object/archive-SYM64-write.test +++ b/test/Object/archive-SYM64-write.test @@ -8,6 +8,9 @@ # RUN: llvm-ar cr %t.lib %t %t2 %p/Inputs/trivial-object-test.elf-x86-64 # RUN: llvm-nm --print-armap %t.lib | FileCheck %s +# Delete temp files. They are too large. +# RUN: rm -f %t %t2 %t.lib + !ELF FileHeader: Class: ELFCLASS64 -- cgit v1.2.3 From 5473af6661103fb6509c89880a08b2f24a3f760a Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Sat, 4 Nov 2017 06:03:29 +0000 Subject: CMake: Make LLVM_BUILD_32_BITS aware of large files. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317402 91177308-0d34-0410-b5e6-96231b3b80d8 --- cmake/modules/HandleLLVMOptions.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake index 03b96645249..cf1ece24848 100644 --- a/cmake/modules/HandleLLVMOptions.cmake +++ b/cmake/modules/HandleLLVMOptions.cmake @@ -232,6 +232,10 @@ if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 ) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -m32") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -m32") set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -m32") + + # FIXME: CMAKE_SIZEOF_VOID_P is still 8 + add_definitions(-D_LARGEFILE_SOURCE) + add_definitions(-D_FILE_OFFSET_BITS=64) endif( LLVM_BUILD_32_BITS ) endif( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 ) @@ -242,6 +246,7 @@ if (ANDROID AND (ANDROID_NATIVE_API_LEVEL LESS 24)) set(LLVM_FORCE_SMALLFILE_FOR_ANDROID TRUE) endif() if( CMAKE_SIZEOF_VOID_P EQUAL 4 AND NOT LLVM_FORCE_SMALLFILE_FOR_ANDROID) + # FIXME: It isn't handled in LLVM_BUILD_32_BITS. add_definitions( -D_LARGEFILE_SOURCE ) add_definitions( -D_FILE_OFFSET_BITS=64 ) endif() -- cgit v1.2.3 From 19bc3f9a843b7072d63c2545d771b08d8d821d8d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 4 Nov 2017 06:44:47 +0000 Subject: [X86] Teach shuffle lowering to use 256-bit SHUF128 when possible. This allows masked operations to be used and allows the register allocator to use YMM16-31 if necessary. As a follow-up I'll look into teaching EVEX->VEX how to turn this back into PERM2X128 if any of the additional features don't work out. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317403 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 10 + test/CodeGen/X86/avx-schedule.ll | 4 +- test/CodeGen/X86/avx2-schedule.ll | 4 +- test/CodeGen/X86/avx512-shuffle-schedule.ll | 768 +++++++++++------------- test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll | 384 ++++++------ test/CodeGen/X86/vector-shuffle-256-v16.ll | 18 +- test/CodeGen/X86/vector-shuffle-256-v32.ll | 32 +- test/CodeGen/X86/vector-shuffle-256-v4.ll | 52 +- test/CodeGen/X86/vector-shuffle-256-v8.ll | 90 ++- 9 files changed, 678 insertions(+), 684 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index ea97dc2dccd..3883415501b 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -12384,6 +12384,16 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); } } + + // Try to use SHUF128 if possible.
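+  // (A worked example of the immediate computed below: at 256 bits SHUF128
+  // selects one 128-bit lane from each source, so WidenedMask = [1, 2] --
+  // lane 1 of V1 and lane 0 of V2 -- yields
+  // PermMask = ((1 % 2) << 0) | ((2 % 2) << 1) = 1.)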
+ if (Subtarget.hasVLX()) { + if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) { + unsigned PermMask = ((WidenedMask[0] % 2) << 0) | + ((WidenedMask[1] % 2) << 1); + return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, + DAG.getConstant(PermMask, DL, MVT::i8)); + } + } } // Otherwise form a 128-bit permutation. After accounting for undefs, diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll index 44d13db65c9..a3e6a18fbc9 100644 --- a/test/CodeGen/X86/avx-schedule.ll +++ b/test/CodeGen/X86/avx-schedule.ll @@ -3447,8 +3447,8 @@ define <4 x double> @test_perm2f128(<4 x double> %a0, <4 x double> %a1, <4 x dou ; ; SKX-LABEL: test_perm2f128: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] ; diff --git a/test/CodeGen/X86/avx2-schedule.ll b/test/CodeGen/X86/avx2-schedule.ll index cec8ca94409..8febe046d81 100644 --- a/test/CodeGen/X86/avx2-schedule.ll +++ b/test/CodeGen/X86/avx2-schedule.ll @@ -2531,8 +2531,8 @@ define <4 x i64> @test_perm2i128(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; SKX-LABEL: test_perm2i128: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] ; diff --git a/test/CodeGen/X86/avx512-shuffle-schedule.ll b/test/CodeGen/X86/avx512-shuffle-schedule.ll index c59fb5b97bc..d1b6e1f7bd3 100755 --- a/test/CodeGen/X86/avx512-shuffle-schedule.ll +++ b/test/CodeGen/X86/avx512-shuffle-schedule.ll @@ -9520,12 +9520,12 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) { ; GENERIC-LABEL: test2_8xfloat_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res @@ -9533,18 +9533,18 @@ define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %ve define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = 
ymm0[4,5,6,7],ymm1[0,1,2,3] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_masked_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9555,18 +9555,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x flo define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9576,18 +9574,18 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_masked_shuff_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; 
SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9598,18 +9596,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x flo define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9619,18 +9615,18 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_masked_shuff_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9641,18 +9637,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x flo define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: 
test2_8xfloat_zero_masked_shuff_mask2:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32>
 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9662,12 +9656,12 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8
 define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) {
 ; GENERIC-LABEL: test2_8xfloat_shuff_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_8xfloat_shuff_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32>
 ret <8 x float> %res
@@ -9675,18 +9669,18 @@ define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %ve
 define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test2_8xfloat_masked_shuff_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32>
 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9697,18 +9691,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x flo
 define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32>
 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9718,12 +9710,12 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x
 define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
 ; GENERIC-LABEL: test_8xfloat_shuff_mem_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_shuff_mem_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x float>, <8 x float>* %vec2p
 %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32>
@@ -9732,18 +9724,18 @@ define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>*
 define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x float>, <8 x float>* %vec2p
 %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32>
@@ -9755,18 +9747,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x
 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x float>, <8 x float>* %vec2p
 %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32>
@@ -9778,18 +9768,18 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1,
 define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask1:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x float>, <8 x float>* %vec2p
 %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32>
@@ -9801,18 +9791,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x
 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x float>, <8 x float>* %vec2p
 %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32>
@@ -9824,18 +9812,18 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1,
 define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask2:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x float>, <8 x float>* %vec2p
 %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32>
@@ -9847,18 +9835,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x
 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x float>, <8 x float>* %vec2p
 %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32>
@@ -9870,12 +9856,12 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1,
 define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
 ; GENERIC-LABEL: test_8xfloat_shuff_mem_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_shuff_mem_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x float>, <8 x float>* %vec2p
 %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32>
@@ -9884,18 +9870,18 @@ define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>*
 define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x float>, <8 x float>* %vec2p
 %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32>
@@ -9907,18 +9893,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x
 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x float>, <8 x float>* %vec2p
 %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32>
@@ -10324,12 +10308,12 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec
 define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) {
 ; GENERIC-LABEL: test_4xdouble_shuff_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_shuff_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
 ret <4 x double> %res
@@ -10337,18 +10321,18 @@ define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double>
 define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10359,18 +10343,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d
 define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10380,18 +10362,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <
 define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mask1:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10402,18 +10384,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d
 define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask1:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10423,18 +10403,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <
 define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mask2:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10445,18 +10425,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d
 define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask2:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10466,12 +10444,12 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <
 define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) {
 ; GENERIC-LABEL: test_4xdouble_shuff_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_shuff_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
 ret <4 x double> %res
@@ -10479,18 +10457,18 @@ define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double>
 define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10501,18 +10479,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d
 define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10522,12 +10498,12 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <
 define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
 ; GENERIC-LABEL: test_4xdouble_shuff_mem_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_shuff_mem_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <4 x double>, <4 x double>* %vec2p
 %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
@@ -10536,18 +10512,18 @@ define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x doub
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <4 x double>, <4 x double>* %vec2p
 %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
@@ -10559,18 +10535,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <4 x double>, <4 x double>* %vec2p
 %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
@@ -10582,18 +10556,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask1:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <4 x double>, <4 x double>* %vec2p
 %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
@@ -10605,18 +10579,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <4 x double>, <4 x double>* %vec2p
 %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
@@ -10628,18 +10600,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask2:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <4 x double>, <4 x double>* %vec2p
 %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
@@ -10651,18 +10623,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <4 x double>, <4 x double>* %vec2p
 %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
@@ -10674,12 +10644,12 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec
 define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
 ; GENERIC-LABEL: test_4xdouble_shuff_mem_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_shuff_mem_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <4 x double>, <4 x double>* %vec2p
 %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
@@ -10688,18 +10658,18 @@ define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x doub
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <4 x double>, <4 x double>* %vec2p
 %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
@@ -10711,18 +10681,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <4 x double>, <4 x double>* %vec2p
 %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32>
@@ -11128,12 +11096,12 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec
 define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
 ; GENERIC-LABEL: test_8xi32_shuff_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_shuff_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
 ret <8 x i32> %res
@@ -11141,18 +11109,18 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
 define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11163,18 +11131,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11184,18 +11150,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32>
 define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask1:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mask1:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11206,18 +11172,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask1:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11227,18 +11191,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32>
 define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask2:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mask2:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11249,18 +11213,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask2:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11270,12 +11232,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32>
 define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
 ; GENERIC-LABEL: test_8xi32_shuff_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_shuff_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
 ret <8 x i32> %res
@@ -11283,18 +11245,18 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
 define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11305,18 +11267,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11326,12 +11286,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32>
 define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
 ; GENERIC-LABEL: test_8xi32_shuff_mem_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_shuff_mem_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x i32>, <8 x i32>* %vec2p
 %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
@@ -11340,18 +11300,18 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p)
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x i32>, <8 x i32>* %vec2p
 %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
@@ -11363,18 +11323,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>*
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x i32>, <8 x i32>* %vec2p
 %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
@@ -11386,18 +11344,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask1:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x i32>, <8 x i32>* %vec2p
 %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
@@ -11409,18 +11367,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>*
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x i32>, <8 x i32>* %vec2p
 %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
@@ -11432,18 +11388,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask2:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x i32>, <8 x i32>* %vec2p
 %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
@@ -11455,18 +11411,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>*
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x i32>, <8 x i32>* %vec2p
 %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
@@ -11478,12 +11432,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i
 define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
 ; GENERIC-LABEL: test_8xi32_shuff_mem_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_shuff_mem_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x i32>, <8 x i32>* %vec2p
 %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
@@ -11492,18 +11446,18 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p)
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x i32>, <8 x i32>* %vec2p
 %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
@@ -11515,18 +11469,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>*
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x i32>, <8 x i32>* %vec2p
 %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
@@ -11932,12 +11884,12 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16
 define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
 ; GENERIC-LABEL: test_4xi64_shuff_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_shuff_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32>
 ret <4 x i64> %res
@@ -11945,18 +11897,18 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
 define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32>
 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -11967,18 +11919,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask0:
 ; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32>
 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -11988,18 +11938,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64>
 define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask1:
 ; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4,
%xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -12010,18 +11960,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -12031,18 +11979,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: 
vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -12053,18 +12001,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -12074,12 +12020,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) { ; GENERIC-LABEL: test_4xi64_shuff_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_shuff_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> ret <4 x i64> %res @@ -12087,18 +12033,18 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) { define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: 
vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -12109,18 +12055,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -12130,12 +12074,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) { ; GENERIC-LABEL: test_4xi64_shuff_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_shuff_mem_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -12144,18 +12088,18 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; 
SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -12167,18 +12111,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -12190,18 +12132,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -12213,18 +12155,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_4xi64_zero_masked_shuff_mem_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -12236,18 +12176,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -12259,18 +12199,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -12282,12 +12220,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) { ; GENERIC-LABEL: test_4xi64_shuff_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_shuff_mem_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} 
ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -12296,18 +12234,18 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -12319,18 +12257,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> diff --git a/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll b/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll index c957a85a885..0a4334e8108 100644 --- a/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll +++ b/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll @@ -6,7 +6,7 @@ define <8 x float> @test_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) { ; CHECK-LABEL: test_8xfloat_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res @@ -14,10 +14,10 @@ define <8 x float> @test_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 
x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -28,10 +28,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x floa define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -41,10 +40,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -55,10 +54,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x floa define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -68,10 +66,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; 
CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -82,10 +80,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x floa define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -95,7 +92,7 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) { ; CHECK-LABEL: test_8xfloat_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res @@ -103,10 +100,10 @@ define <8 x float> @test_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -117,10 +114,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x floa define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -130,7 +126,7 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) { ; CHECK-LABEL: test_8xfloat_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 = 
ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -139,10 +135,10 @@ define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -154,10 +150,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -169,10 +164,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -184,10 +179,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -199,10 +193,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; 
CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -214,10 +208,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -229,7 +222,7 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) { ; CHECK-LABEL: test_8xfloat_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -238,10 +231,10 @@ define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -253,10 +246,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -522,7 +514,7 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) { ; CHECK-LABEL: test_4xdouble_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res @@ -530,10 +522,10 @@ define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> 
%vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -544,10 +536,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -557,10 +548,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, < define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -571,10 +562,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -584,10 +574,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, < define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 
{%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -598,10 +588,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -611,7 +600,7 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, < define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) { ; CHECK-LABEL: test_4xdouble_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res @@ -619,10 +608,10 @@ define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -633,10 +622,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -646,7 +634,7 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, < define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) { ; CHECK-LABEL: test_4xdouble_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: 
vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -655,10 +643,10 @@ define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x doub define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -670,10 +658,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -685,10 +672,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -700,10 +687,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -715,10 +701,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; 
CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -730,10 +716,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -745,7 +730,7 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) { ; CHECK-LABEL: test_4xdouble_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -754,10 +739,10 @@ define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x doub define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -769,10 +754,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -1038,7 +1022,7 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) { ; CHECK-LABEL: test_8xi32_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> ret <8 x i32> %res @@ -1046,10 +1030,10 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) { define <8 x i32> 
@test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1060,10 +1044,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1073,10 +1056,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1087,10 +1070,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1100,10 +1082,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1114,10 +1096,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1127,7 +1108,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) { ; CHECK-LABEL: test_8xi32_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> ret <8 x i32> %res @@ -1135,10 +1116,10 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) { define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1149,10 +1130,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1162,7 +1142,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) { ; CHECK-LABEL: test_8xi32_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %res = shufflevector <8 x i32> %vec1, 
<8 x i32> %vec2, <8 x i32> @@ -1171,10 +1151,10 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1186,10 +1166,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1201,10 +1180,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1216,10 +1195,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1231,10 +1209,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x 
i32> %vec2, <8 x i32> @@ -1246,10 +1224,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1261,7 +1238,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) { ; CHECK-LABEL: test_8xi32_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1270,10 +1247,10 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1285,10 +1262,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1554,7 +1530,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) { ; CHECK-LABEL: test_4xi64_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> ret <4 x i64> %res @@ -1562,10 +1538,10 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) { define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; 
CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1576,10 +1552,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1589,10 +1564,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1603,10 +1578,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1616,10 +1590,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1630,10 +1604,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x 
i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1643,7 +1616,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) { ; CHECK-LABEL: test_4xi64_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> ret <4 x i64> %res @@ -1651,10 +1624,10 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) { define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1665,10 +1638,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1678,7 +1650,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) { ; CHECK-LABEL: test_4xi64_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1687,10 +1659,10 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; 
CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1702,10 +1674,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1717,10 +1688,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1732,10 +1703,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1747,10 +1717,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1762,10 +1732,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: 
vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1777,7 +1746,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) { ; CHECK-LABEL: test_4xi64_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1786,10 +1755,10 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1801,10 +1770,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll index dd329d21dc9..cbc190d0db3 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -3963,10 +3963,20 @@ define <16 x i16> @concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31(<16 x i } define <16 x i16> @concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc(<16 x i16> %a, <16 x i16> %b) { -; ALL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: retq +; AVX1-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: retq %ahi = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> %bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> %bc0hi = bitcast <8 x i16> %ahi to <16 x i8> diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll 
b/test/CodeGen/X86/vector-shuffle-256-v32.ll index 3c69f6160dd..c5c2312b161 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1682,11 +1682,17 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: -; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1702,11 +1708,17 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: -; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index cf1aaca4ee2..6e0e80b4016 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -418,21 +418,45 @@ define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) { } define <4 x double> @shuffle_v4f64_3254(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_3254: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_3254: +; AVX1: # BB#0: +; 
AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_3254: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_3254: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX512VL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_3276: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_3276: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_3276: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_3276: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -1053,8 +1077,8 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_3254: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -1075,8 +1099,8 @@ define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_3276: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll index b95e7cf008a..38891b46577 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -768,21 +768,33 @@ define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) { } define <8 x float> @shuffle_v8f32_7654fedc(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_7654fedc: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8f32_7654fedc: +; AVX1OR2: # BB#0: +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8f32_7654fedc: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; 
AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_fedc7654: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8f32_fedc7654: +; AVX1OR2: # BB#0: +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8f32_fedc7654: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1789,21 +1801,33 @@ define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) { } define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_7654fedc: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8i32_7654fedc: +; AVX1OR2: # BB#0: +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_7654fedc: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_fedc7654: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8i32_fedc7654: +; AVX1OR2: # BB#0: +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_fedc7654: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2177,10 +2201,15 @@ define <8 x i32> @concat_v8i32_0123CDEF(<8 x i32> %a, <8 x i32> %b) { } define <8 x i32> @concat_v8i32_4567CDEF_bc(<8 x i32> %a0, <8 x i32> %a1) { -; ALL-LABEL: concat_v8i32_4567CDEF_bc: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: retq +; AVX1OR2-LABEL: concat_v8i32_4567CDEF_bc: +; AVX1OR2: # BB#0: +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: concat_v8i32_4567CDEF_bc: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: retq %a0hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> %a1hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> %bc0hi = bitcast <4 x i32> %a0hi to <2 x i64> @@ -2191,10 +2220,15 @@ define <8 x i32> @concat_v8i32_4567CDEF_bc(<8 x i32> %a0, <8 x i32> %a1) { } define <8 x float> 
@concat_v8f32_4567CDEF_bc(<8 x float> %f0, <8 x float> %f1) {
-; ALL-LABEL: concat_v8f32_4567CDEF_bc:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: concat_v8f32_4567CDEF_bc:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: concat_v8f32_4567CDEF_bc:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: retq
 %a0 = bitcast <8 x float> %f0 to <4 x i64>
 %a1 = bitcast <8 x float> %f1 to <8 x i32>
 %a0hi = shufflevector <4 x i64> %a0, <4 x i64> undef, <2 x i32>
--
cgit v1.2.3

From ce4da272347af8a89a8f2dc388565527a22aa75c Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi
Date: Sat, 4 Nov 2017 06:55:55 +0000
Subject: llvm/test/lit.cfg.py: Don't set the feature "llvm-64-bits" if -m32 is
 specified.

FIXME: LLVM_BUILD_32_BITS should modify host_triple.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317404 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/lit.cfg.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/lit.cfg.py b/test/lit.cfg.py
index 57dc1f07049..73a3b4b58a8 100644
--- a/test/lit.cfg.py
+++ b/test/lit.cfg.py
@@ -169,7 +169,8 @@ for arch in config.targets_to_build.split():

 # Features
 known_arches = ["x86_64", "mips64", "ppc64", "aarch64"]
-if any(config.llvm_host_triple.startswith(x) for x in known_arches):
+if (config.host_ldflags.find("-m32") < 0
+    and any(config.llvm_host_triple.startswith(x) for x in known_arches)):
     config.available_features.add("llvm-64-bits")

 # Others/can-execute.txt
--
cgit v1.2.3

From dcf1ffe8a0867a311092f2379195e9b646e42c1d Mon Sep 17 00:00:00 2001
From: Sean Fertile
Date: Sat, 4 Nov 2017 17:04:39 +0000
Subject: [LTO][ThinLTO] Use the linker resolutions to mark global values as
 dso_local.

Now that we have a way to mark GlobalValues as local, we can use the symbol
resolutions that the linker plugin provides as part of the LTO/ThinLTO link
step to refine the compiler's view on what symbols will end up being local.

Originally committed as r317374, but reverted in r317395 to update some
missed tests.
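For illustration (a minimal sketch; the module and the names @x and @get_x
below are hypothetical, not taken from this patch), consider an input module
such as:

  @x = global i32 42

  define i32 @get_x() {
    %v = load i32, i32* @x
    ret i32 %v
  }

If the linker reports both symbols with FinalDefinitionInLinkageUnit set in
its resolutions, the link step now marks them dso_local, so after LTO they
would come out roughly as:

  @x = dso_local global i32 42

  define dso_local i32 @get_x() {
    %v = load i32, i32* @x
    ret i32 %v
  }

The backend can then assume these definitions are not preemptible at runtime
and reference them directly, for example without going through the GOT.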
Differential Revision: https://reviews.llvm.org/D35702 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317408 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/ModuleSummaryIndex.h | 12 +++++++-- include/llvm/IR/ModuleSummaryIndexYAML.h | 8 +++--- lib/Analysis/ModuleSummaryAnalysis.cpp | 9 ++++--- lib/Bitcode/Reader/BitcodeReader.cpp | 4 ++- lib/Bitcode/Writer/BitcodeWriter.cpp | 2 ++ lib/LTO/LTO.cpp | 21 +++++++++++---- lib/Transforms/Utils/FunctionImportUtils.cpp | 17 ++++++++++++ test/Bitcode/thinlto-summary-local-5.0.ll | 22 +++++++++++++++ test/Bitcode/thinlto-summary-local-5.0.ll.bc | Bin 0 -> 1028 bytes test/LTO/Resolution/X86/comdat-mixed-lto.ll | 2 +- test/LTO/Resolution/X86/comdat.ll | 4 +-- test/LTO/Resolution/X86/commons.ll | 2 +- test/ThinLTO/X86/deadstrip.ll | 30 +++++++++++++-------- test/ThinLTO/X86/funcimport2.ll | 4 +-- test/ThinLTO/X86/internalize.ll | 9 ++++--- test/ThinLTO/X86/reference_non_importable.ll | 2 +- test/Transforms/LowerTypeTests/import-unsat.ll | 1 + .../PGOProfile/thinlto_samplepgo_icp2.ll | 2 +- test/Transforms/WholeProgramDevirt/import-indir.ll | 1 + test/tools/gold/X86/asm_undefined2.ll | 3 ++- test/tools/gold/X86/coff.ll | 2 +- test/tools/gold/X86/common.ll | 2 +- test/tools/gold/X86/emit-llvm.ll | 6 ++--- test/tools/gold/X86/global_with_section.ll | 16 +++++------ test/tools/gold/X86/parallel.ll | 8 +++--- test/tools/gold/X86/thinlto_linkonceresolution.ll | 2 +- test/tools/gold/X86/thinlto_weak_library.ll | 2 +- test/tools/gold/X86/visibility.ll | 2 +- 28 files changed, 137 insertions(+), 58 deletions(-) create mode 100644 test/Bitcode/thinlto-summary-local-5.0.ll create mode 100644 test/Bitcode/thinlto-summary-local-5.0.ll.bc diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h index 2d664f41e3c..b1e58a2a0d9 100644 --- a/include/llvm/IR/ModuleSummaryIndex.h +++ b/include/llvm/IR/ModuleSummaryIndex.h @@ -148,11 +148,15 @@ public: /// In combined summary, indicate that the global value is live. unsigned Live : 1; + /// Indicates that the linker resolved the symbol to a definition from + /// within the same linkage unit. + unsigned DSOLocal : 1; + /// Convenience Constructors explicit GVFlags(GlobalValue::LinkageTypes Linkage, - bool NotEligibleToImport, bool Live) + bool NotEligibleToImport, bool Live, bool IsLocal) : Linkage(Linkage), NotEligibleToImport(NotEligibleToImport), - Live(Live) {} + Live(Live), DSOLocal(IsLocal) {} }; private: @@ -229,6 +233,10 @@ public: void setLive(bool Live) { Flags.Live = Live; } + void setDSOLocal(bool Local) { Flags.DSOLocal = Local; } + + bool isDSOLocal() const { return Flags.DSOLocal; } + /// Flag that this global value cannot be imported. 
void setNotEligibleToImport() { Flags.NotEligibleToImport = true; } diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h index 2f9990ca03d..4687f2d53e7 100644 --- a/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -135,7 +135,7 @@ template <> struct MappingTraits { struct FunctionSummaryYaml { unsigned Linkage; - bool NotEligibleToImport, Live; + bool NotEligibleToImport, Live, IsLocal; std::vector TypeTests; std::vector TypeTestAssumeVCalls, TypeCheckedLoadVCalls; @@ -177,6 +177,7 @@ template <> struct MappingTraits { io.mapOptional("Linkage", summary.Linkage); io.mapOptional("NotEligibleToImport", summary.NotEligibleToImport); io.mapOptional("Live", summary.Live); + io.mapOptional("Local", summary.IsLocal); io.mapOptional("TypeTests", summary.TypeTests); io.mapOptional("TypeTestAssumeVCalls", summary.TypeTestAssumeVCalls); io.mapOptional("TypeCheckedLoadVCalls", summary.TypeCheckedLoadVCalls); @@ -211,7 +212,7 @@ template <> struct CustomMappingTraits { Elem.SummaryList.push_back(llvm::make_unique( GlobalValueSummary::GVFlags( static_cast(FSum.Linkage), - FSum.NotEligibleToImport, FSum.Live), + FSum.NotEligibleToImport, FSum.Live, FSum.IsLocal), 0, FunctionSummary::FFlags{}, ArrayRef{}, ArrayRef{}, std::move(FSum.TypeTests), std::move(FSum.TypeTestAssumeVCalls), @@ -228,7 +229,8 @@ template <> struct CustomMappingTraits { FSums.push_back(FunctionSummaryYaml{ FSum->flags().Linkage, static_cast(FSum->flags().NotEligibleToImport), - static_cast(FSum->flags().Live), FSum->type_tests(), + static_cast(FSum->flags().Live), + static_cast(FSum->flags().DSOLocal), FSum->type_tests(), FSum->type_test_assume_vcalls(), FSum->type_checked_load_vcalls(), FSum->type_test_assume_const_vcalls(), FSum->type_checked_load_const_vcalls()}); diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp index afd575e7273..82db09ca97b 100644 --- a/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -303,7 +303,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, // FIXME: refactor this to use the same code that inliner is using. 
F.isVarArg(); GlobalValueSummary::GVFlags Flags(F.getLinkage(), NotEligibleForImport, - /* Live = */ false); + /* Live = */ false, F.isDSOLocal()); FunctionSummary::FFlags FunFlags{ F.hasFnAttribute(Attribute::ReadNone), F.hasFnAttribute(Attribute::ReadOnly), @@ -329,7 +329,7 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V, findRefEdges(Index, &V, RefEdges, Visited); bool NonRenamableLocal = isNonRenamableLocal(V); GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal, - /* Live = */ false); + /* Live = */ false, V.isDSOLocal()); auto GVarSummary = llvm::make_unique(Flags, RefEdges.takeVector()); if (NonRenamableLocal) @@ -342,7 +342,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A, DenseSet &CantBePromoted) { bool NonRenamableLocal = isNonRenamableLocal(A); GlobalValueSummary::GVFlags Flags(A.getLinkage(), NonRenamableLocal, - /* Live = */ false); + /* Live = */ false, A.isDSOLocal()); auto AS = llvm::make_unique(Flags); auto *Aliasee = A.getBaseObject(); auto *AliaseeSummary = Index.getGlobalValueSummary(*Aliasee); @@ -410,7 +410,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( assert(GV->isDeclaration() && "Def in module asm already has definition"); GlobalValueSummary::GVFlags GVFlags(GlobalValue::InternalLinkage, /* NotEligibleToImport = */ true, - /* Live = */ true); + /* Live = */ true, + /* Local */ GV->isDSOLocal()); CantBePromoted.insert(GlobalValue::getGUID(Name)); // Create the appropriate summary type. if (Function *F = dyn_cast(GV)) { diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index c2272260f44..d0f11db8f61 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -889,7 +889,9 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags, // to work correctly on earlier versions, we must conservatively treat all // values as live. bool Live = (RawFlags & 0x2) || Version < 3; - return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live); + bool Local = (RawFlags & 0x4); + + return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live, Local); } static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) { diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 1e491aa066e..c5d376c9426 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -955,6 +955,8 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) { RawFlags |= Flags.NotEligibleToImport; // bool RawFlags |= (Flags.Live << 1); + RawFlags |= (Flags.DSOLocal << 2); + // Linkage don't need to be remapped at that time for the summary. Any future // change to the getEncodedLinkage() function will need to be taken into // account here as well. diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index 017dd201f9c..9c737795b5a 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -630,6 +630,9 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef Syms, NonPrevailingComdats.insert(GV->getComdat()); cast(GV)->setComdat(nullptr); } + + // Set the 'local' flag based on the linker resolution for this symbol. + GV->setDSOLocal(Res.FinalDefinitionInLinkageUnit); } // Common resolution: collect the maximum size/alignment over all commons. 
 // We also record if we see an instance of a common as prevailing, so that
@@ -643,7 +646,6 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef Syms,
     CommonRes.Prevailing |= Res.Prevailing;
   }

-  // FIXME: use proposed local attribute for FinalDefinitionInLinkageUnit.
 }
   if (!M.getComdatSymbolTable().empty())
     for (GlobalValue &GV : M.global_values())
@@ -698,10 +700,10 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef Syms,
     assert(ResI != ResE);
     SymbolResolution Res = *ResI++;

-    if (Res.Prevailing) {
-      if (!Sym.getIRName().empty()) {
-        auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
-            Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
+    if (!Sym.getIRName().empty()) {
+      auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
+          Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
+      if (Res.Prevailing) {
         ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();

         // For linker redefined symbols (via --wrap or --defsym) we want to
@@ -713,6 +715,15 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef Syms,
                 GUID, BM.getModuleIdentifier()))
           S->setLinkage(GlobalValue::WeakAnyLinkage);
       }
+
+      // If the linker resolved the symbol to a local definition, then mark it
+      // as local in the summary for the module we are adding.
+      if (Res.FinalDefinitionInLinkageUnit) {
+        if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
+                GUID, BM.getModuleIdentifier())) {
+          S->setDSOLocal(true);
+        }
+      }
     }
   }
diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp
index fbb61ac1ae9..2e6fc4e8482 100644
--- a/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -203,6 +203,23 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV,
 }

 void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
+
+  // Check the summaries to see if the symbol gets resolved to a known local
+  // definition.
+  if (GV.hasName()) {
+    ValueInfo VI = ImportIndex.getValueInfo(GV.getGUID());
+    if (VI) {
+      // Need to check all summaries are local in case of hash collisions.
+      bool IsLocal = VI.getSummaryList().size() &&
+                     llvm::all_of(VI.getSummaryList(),
+                                  [](const std::unique_ptr<GlobalValueSummary> &Summary) {
+                                    return Summary->isDSOLocal();
+                                  });
+      if (IsLocal)
+        GV.setDSOLocal(true);
+    }
+  }
+
   bool DoPromote = false;
   if (GV.hasLocalLinkage() &&
       ((DoPromote = shouldPromoteLocalToGlobal(&GV)) || isPerformingImport())) {
diff --git a/test/Bitcode/thinlto-summary-local-5.0.ll b/test/Bitcode/thinlto-summary-local-5.0.ll
new file mode 100644
index 00000000000..cbc48d23df3
--- /dev/null
+++ b/test/Bitcode/thinlto-summary-local-5.0.ll
@@ -0,0 +1,22 @@
+; Bitcode compatibility test for dso_local flag in thin-lto summaries.
+; Checks that older bitcode summaries without the dso_local op are still
+; properly parsed and don't set GlobalValues as dso_local.
+
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; RUN: llvm-bcanalyzer -dump %s.bc | FileCheck %s --check-prefix=BCAN
+
+define void @foo() {
+;CHECK-DAG:define void @foo()
+  ret void
+}
+
+@bar = global i32 0
+;CHECK-DAG: @bar = global i32 0
+
+@baz = alias i32, i32* @bar
+;CHECK-DAG: @baz = alias i32, i32* @bar
+
+;BCAN:
+;BCAN-NEXT:
+;BCAN-NEXT:
diff --git a/test/Bitcode/thinlto-summary-local-5.0.ll.bc b/test/Bitcode/thinlto-summary-local-5.0.ll.bc
new file mode 100644
index 00000000000..8dc7ca0a74b
Binary files /dev/null and b/test/Bitcode/thinlto-summary-local-5.0.ll.bc differ
diff --git a/test/LTO/Resolution/X86/comdat-mixed-lto.ll b/test/LTO/Resolution/X86/comdat-mixed-lto.ll
index f6ee22e4161..d6022c64351 100644
--- a/test/LTO/Resolution/X86/comdat-mixed-lto.ll
+++ b/test/LTO/Resolution/X86/comdat-mixed-lto.ll
@@ -17,7 +17,7 @@
 ; would clash with the copy from this module.
 ; RUN: llvm-dis %t3.0.0.preopt.bc -o - | FileCheck %s
 ; CHECK: define internal void @__cxx_global_var_init() section ".text.startup" {
-; CHECK: define available_externally void @testglobfunc() section ".text.startup" {
+; CHECK: define available_externally dso_local void @testglobfunc() section ".text.startup" {
 ; ModuleID = 'comdat-mixed-lto.o'
 source_filename = "comdat-mixed-lto.cpp"
diff --git a/test/LTO/Resolution/X86/comdat.ll b/test/LTO/Resolution/X86/comdat.ll
index 60d082b3e0f..94f28384231 100644
--- a/test/LTO/Resolution/X86/comdat.ll
+++ b/test/LTO/Resolution/X86/comdat.ll
@@ -70,14 +70,14 @@ bb11:
 ; CHECK-DAG: @a23 = alias i32 (i8*), i32 (i8*)* @f1.2{{$}}
 ; CHECK-DAG: @a24 = alias i16, bitcast (i32 (i8*)* @f1.2 to i16*)
-; CHECK: define weak_odr i32 @f1(i8*) comdat($c1) {
+; CHECK: define weak_odr dso_local i32 @f1(i8*) comdat($c1) {
 ; CHECK-NEXT: bb10:
 ; CHECK-NEXT: br label %bb11{{$}}
 ; CHECK: bb11:
 ; CHECK-NEXT: ret i32 42
 ; CHECK-NEXT: }
-; CHECK: define internal i32 @f1.2(i8* %this) comdat($c2) {
+; CHECK: define internal dso_local i32 @f1.2(i8* %this) comdat($c2) {
 ; CHECK-NEXT: bb20:
 ; CHECK-NEXT: store i8* %this, i8** null
 ; CHECK-NEXT: br label %bb21
diff --git a/test/LTO/Resolution/X86/commons.ll b/test/LTO/Resolution/X86/commons.ll
index 28bf1ada4a8..8adfb87d6ed 100644
--- a/test/LTO/Resolution/X86/commons.ll
+++ b/test/LTO/Resolution/X86/commons.ll
@@ -4,7 +4,7 @@
 ; RUN: llvm-dis -o - %t.out.0.0.preopt.bc | FileCheck %s
 ; A strong definition should override the common
-; CHECK: @x = global i32 42, align 4
+; CHECK: @x = dso_local global i32 42, align 4
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/ThinLTO/X86/deadstrip.ll b/test/ThinLTO/X86/deadstrip.ll
index c19ccb01be3..90de3bb9a32 100644
--- a/test/ThinLTO/X86/deadstrip.ll
+++ b/test/ThinLTO/X86/deadstrip.ll
@@ -18,8 +18,8 @@
 ; RUN:   -r %t2.bc,_boo,pl \
 ; RUN:   -r %t2.bc,_dead_func,pl \
 ; RUN:   -r %t2.bc,_another_dead_func,pl
-; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s
-; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=CHECK2
+; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s --check-prefix=LTO2
+; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=LTO2-CHECK2
 ; RUN: llvm-nm %t.out.1 | FileCheck %s --check-prefix=CHECK2-NM
 ; RUN: llvm-bcanalyzer -dump %t.out.index.bc | FileCheck %s --check-prefix=COMBINED
@@ -27,14 +27,14 @@
 ; COMBINED-DAG: