aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@hotmail.com>2018-11-09 16:18:04 +0000
committerAlexey Bataev <a.bataev@hotmail.com>2018-11-09 16:18:04 +0000
commit223835dbb1f8b240a8262100dbef25b02e6558c7 (patch)
tree414efddea92fac35a901bf8a165cf94060e4fff3
parent9096f2a5c68ea935cb3d1b08ab36a93c4d729cbd (diff)
[OPENMP][NVPTX]Allow to use shared memory for the
target|teams|distribute variables. If the total size of the variables, declared in target|teams|distribute regions, is less than the maximal size of shared memory available, the buffer is allocated in the shared memory. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@346507 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp150
-rw-r--r--lib/CodeGen/CGOpenMPRuntimeNVPTX.h6
-rw-r--r--test/OpenMP/nvptx_data_sharing.cpp11
-rw-r--r--test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp8
-rw-r--r--test/OpenMP/nvptx_parallel_codegen.cpp11
-rw-r--r--test/OpenMP/nvptx_parallel_for_codegen.cpp11
-rw-r--r--test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp9
-rw-r--r--test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp9
-rw-r--r--test/OpenMP/nvptx_teams_codegen.cpp26
9 files changed, 143 insertions, 98 deletions
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 86acf57d72..ab9631c1fe 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -176,6 +176,9 @@ enum MachineConfiguration : unsigned {
/// Global memory alignment for performance.
GlobalMemoryAlignment = 128,
+
+ /// Maximal size of the shared memory buffer.
+ SharedMemorySize = 128,
};
enum NamedBarrier : unsigned {
@@ -1143,13 +1146,6 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
IsInTTDRegion = true;
// Reserve place for the globalized memory.
GlobalizedRecords.emplace_back();
- if (!StaticGlobalized) {
- StaticGlobalized = new llvm::GlobalVariable(
- CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
- llvm::GlobalValue::WeakAnyLinkage, nullptr,
- "_openmp_static_glob_rd$ptr");
- StaticGlobalized->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
- }
if (!KernelStaticGlobalized) {
KernelStaticGlobalized = new llvm::GlobalVariable(
CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
@@ -1277,13 +1273,6 @@ void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
IsInTTDRegion = true;
// Reserve place for the globalized memory.
GlobalizedRecords.emplace_back();
- if (!StaticGlobalized) {
- StaticGlobalized = new llvm::GlobalVariable(
- CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
- llvm::GlobalValue::WeakAnyLinkage, nullptr,
- "_openmp_static_glob_rd$ptr");
- StaticGlobalized->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
- }
if (!KernelStaticGlobalized) {
KernelStaticGlobalized = new llvm::GlobalVariable(
CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
@@ -2138,30 +2127,41 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
++GlobalizedRecords.back().RegionCounter;
if (GlobalizedRecords.back().Records.size() == 1) {
- assert(StaticGlobalized &&
- "Static pointer must be initialized already.");
- Address Buffer = CGF.EmitLoadOfPointer(
- Address(StaticGlobalized, CGM.getPointerAlign()),
- CGM.getContext()
- .getPointerType(CGM.getContext().VoidPtrTy)
- .castAs<PointerType>());
+ assert(KernelStaticGlobalized &&
+ "Kernel static pointer must be initialized already.");
+ auto *UseSharedMemory = new llvm::GlobalVariable(
+ CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
+ llvm::GlobalValue::InternalLinkage, nullptr,
+ "_openmp_static_kernel$is_shared");
+ UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
+ QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
+ /*DestWidth=*/16, /*Signed=*/0);
+ llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
+ Address(UseSharedMemory,
+ CGM.getContext().getTypeAlignInChars(Int16Ty)),
+ /*Volatile=*/false, Int16Ty, Loc);
+ auto *StaticGlobalized = new llvm::GlobalVariable(
+ CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
+ llvm::GlobalValue::WeakAnyLinkage, nullptr);
auto *RecSize = new llvm::GlobalVariable(
CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
llvm::GlobalValue::InternalLinkage, nullptr,
"_openmp_static_kernel$size");
RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
llvm::Value *Ld = CGF.EmitLoadOfScalar(
- Address(RecSize, CGM.getPointerAlign()), /*Volatile=*/false,
+ Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
CGM.getContext().getSizeType(), Loc);
llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
KernelStaticGlobalized, CGM.VoidPtrPtrTy);
- llvm::Value *GlobalRecordSizeArg[] = {
- Buffer.getPointer(), Ld,
- llvm::ConstantInt::getNullValue(CGM.Int16Ty), ResAddr};
+ llvm::Value *GlobalRecordSizeArg[] = {StaticGlobalized, Ld,
+ IsInSharedMemory, ResAddr};
CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
OMPRTL_NVPTX__kmpc_get_team_static_memory),
GlobalRecordSizeArg);
+ GlobalizedRecords.back().Buffer = StaticGlobalized;
GlobalizedRecords.back().RecSize = RecSize;
+ GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
+ GlobalizedRecords.back().Loc = Loc;
}
assert(KernelStaticGlobalized && "Global address must be set already.");
Address FrameAddr = CGF.EmitLoadOfPointer(
@@ -2336,10 +2336,16 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
--GlobalizedRecords.back().RegionCounter;
// Emit the restore function only in the target region.
if (GlobalizedRecords.back().RegionCounter == 0) {
+ QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
+ /*DestWidth=*/16, /*Signed=*/0);
+ llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
+ Address(GlobalizedRecords.back().UseSharedMemory,
+ CGM.getContext().getTypeAlignInChars(Int16Ty)),
+ /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(
OMPRTL_NVPTX__kmpc_restore_team_static_memory),
- llvm::ConstantInt::getNullValue(CGM.Int16Ty));
+ IsInSharedMemory);
}
} else {
CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
@@ -4507,21 +4513,24 @@ static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
void CGOpenMPRuntimeNVPTX::clear() {
if (!GlobalizedRecords.empty()) {
ASTContext &C = CGM.getContext();
+ llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
+ llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
RecordDecl *StaticRD = C.buildImplicitRecord(
"_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
StaticRD->startDefinition();
+ RecordDecl *SharedStaticRD = C.buildImplicitRecord(
+ "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
+ SharedStaticRD->startDefinition();
for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
if (Records.Records.empty())
continue;
unsigned Size = 0;
unsigned RecAlignment = 0;
for (const RecordDecl *RD : Records.Records) {
- QualType RDTy = CGM.getContext().getRecordType(RD);
- unsigned Alignment =
- CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
+ QualType RDTy = C.getRecordType(RD);
+ unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
RecAlignment = std::max(RecAlignment, Alignment);
- unsigned RecSize =
- CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
+ unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
Size =
llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
}
@@ -4529,32 +4538,67 @@ void CGOpenMPRuntimeNVPTX::clear() {
llvm::APInt ArySize(/*numBits=*/64, Size);
QualType SubTy = C.getConstantArrayType(
C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
- auto *Field = FieldDecl::Create(
- C, StaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
- C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
- /*BW=*/nullptr, /*Mutable=*/false,
- /*InitStyle=*/ICIS_NoInit);
+ const bool UseSharedMemory = Size <= SharedMemorySize;
+ auto *Field =
+ FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
+ SourceLocation(), SourceLocation(), nullptr, SubTy,
+ C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
+ /*BW=*/nullptr, /*Mutable=*/false,
+ /*InitStyle=*/ICIS_NoInit);
Field->setAccess(AS_public);
- StaticRD->addDecl(Field);
+ if (UseSharedMemory) {
+ SharedStaticRD->addDecl(Field);
+ SharedRecs.push_back(&Records);
+ } else {
+ StaticRD->addDecl(Field);
+ GlobalRecs.push_back(&Records);
+ }
Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
+ Records.UseSharedMemory->setInitializer(
+ llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
+ }
+ SharedStaticRD->completeDefinition();
+ if (!SharedStaticRD->field_empty()) {
+ QualType StaticTy = C.getRecordType(SharedStaticRD);
+ llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
+ auto *GV = new llvm::GlobalVariable(
+ CGM.getModule(), LLVMStaticTy,
+ /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+ llvm::Constant::getNullValue(LLVMStaticTy),
+ "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
+ llvm::GlobalValue::NotThreadLocal,
+ C.getTargetAddressSpace(LangAS::cuda_shared));
+ auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+ GV, CGM.VoidPtrTy);
+ for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
+ Rec->Buffer->replaceAllUsesWith(Replacement);
+ Rec->Buffer->eraseFromParent();
+ }
}
StaticRD->completeDefinition();
- QualType StaticTy = C.getRecordType(StaticRD);
- std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
- llvm::APInt Size1(32, SMsBlockPerSM.second);
- QualType Arr1Ty = C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal,
- /*IndexTypeQuals=*/0);
- llvm::APInt Size2(32, SMsBlockPerSM.first);
- QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
- /*IndexTypeQuals=*/0);
- llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
- auto *GV = new llvm::GlobalVariable(
- CGM.getModule(), LLVMArr2Ty,
- /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
- llvm::Constant::getNullValue(LLVMArr2Ty), "_openmp_static_glob_rd_$_");
- StaticGlobalized->setInitializer(
- llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
- CGM.VoidPtrTy));
+ if (!StaticRD->field_empty()) {
+ QualType StaticTy = C.getRecordType(StaticRD);
+ std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
+ llvm::APInt Size1(32, SMsBlockPerSM.second);
+ QualType Arr1Ty =
+ C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal,
+ /*IndexTypeQuals=*/0);
+ llvm::APInt Size2(32, SMsBlockPerSM.first);
+ QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
+ /*IndexTypeQuals=*/0);
+ llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
+ auto *GV = new llvm::GlobalVariable(
+ CGM.getModule(), LLVMArr2Ty,
+ /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+ llvm::Constant::getNullValue(LLVMArr2Ty),
+ "_openmp_static_glob_rd_$_");
+ auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+ GV, CGM.VoidPtrTy);
+ for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
+ Rec->Buffer->replaceAllUsesWith(Replacement);
+ Rec->Buffer->eraseFromParent();
+ }
+ }
}
CGOpenMPRuntime::clear();
}
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
index dabc6b4f85..a157e421e0 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -420,14 +420,14 @@ private:
/// union. This resulting union (one per CU) is the entry point for the static
/// memory management runtime functions.
struct GlobalPtrSizeRecsTy {
+ llvm::GlobalVariable *UseSharedMemory = nullptr;
llvm::GlobalVariable *RecSize = nullptr;
+ llvm::GlobalVariable *Buffer = nullptr;
+ SourceLocation Loc;
llvm::SmallVector<const RecordDecl *, 2> Records;
unsigned RegionCounter = 0;
};
llvm::SmallVector<GlobalPtrSizeRecsTy, 8> GlobalizedRecords;
- /// Global variable used for staticlly allocated global memoryused for
- /// globalization in target/teams/distribute regions.
- llvm::GlobalVariable *StaticGlobalized = nullptr;
/// Shared pointer for the global memory in the global memory buffer used for
/// the given kernel.
llvm::GlobalVariable *KernelStaticGlobalized = nullptr;
diff --git a/test/OpenMP/nvptx_data_sharing.cpp b/test/OpenMP/nvptx_data_sharing.cpp
index 2f251a7dd0..7f822aba92 100644
--- a/test/OpenMP/nvptx_data_sharing.cpp
+++ b/test/OpenMP/nvptx_data_sharing.cpp
@@ -27,10 +27,10 @@ void test_ds(){
}
}
// CK1: [[MEM_TY:%.+]] = type { [8 x i8] }
-// CK1-DAG: [[GLOBAL_RD:@.+]] = weak global [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]] zeroinitializer
-// CK1-DAG: [[GLOBAL_RD_PTR:@.+]] = weak unnamed_addr constant i8* getelementptr inbounds ([{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]], [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]]* [[GLOBAL_RD]], i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0)
+// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
// CK1-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CK1-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i64 8
+// CK1-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
/// ========= In the worker function ========= ///
// CK1: {{.*}}define internal void @__omp_offloading{{.*}}test_ds{{.*}}_worker()
@@ -44,9 +44,9 @@ void test_ds(){
// CK1: [[SHAREDARGS2:%.+]] = alloca i8**
// CK1: call void @__kmpc_kernel_init
// CK1: call void @__kmpc_data_sharing_init_stack
-// CK1: [[GLOBAL_RD:%.+]] = load i8*, i8** [[GLOBAL_RD_PTR]],
+// CK1: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// CK1: [[SIZE:%.+]] = load i64, i64* [[KERNEL_SIZE]],
-// CK1: call void @__kmpc_get_team_static_memory(i8* [[GLOBAL_RD]], i64 [[SIZE]], i16 0, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
+// CK1: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 [[SIZE]], i16 [[SHARED_MEM_FLAG]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// CK1: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// CK1: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i64 0
// CK1: [[GLOBALSTACK2:%.+]] = bitcast i8* [[GLOBALSTACK]] to %struct._globalized_locals_ty*
@@ -75,7 +75,8 @@ void test_ds(){
// CK1: call void @llvm.nvvm.barrier0()
// CK1: call void @llvm.nvvm.barrier0()
// CK1: call void @__kmpc_end_sharing_variables()
-// CK1: call void @__kmpc_restore_team_static_memory(i16 0)
+// CK1: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]],
+// CK1: call void @__kmpc_restore_team_static_memory(i16 [[SHARED_MEM_FLAG]])
// CK1: call void @__kmpc_kernel_deinit(i16 1)
/// ========= In the data sharing wrapper function ========= ///
diff --git a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
index c41964517b..71a3ad5491 100644
--- a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
+++ b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
@@ -22,8 +22,7 @@ int main(int argc, char **argv) {
}
// CHECK: [[MEM_TY:%.+]] = type { [84 x i8] }
-// CHECK-DAG: [[GLOBAL_RD:@.+]] = weak global [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]] zeroinitializer
-// CHECK-DAG: [[GLOBAL_RD_PTR:@.+]] = weak unnamed_addr constant i8* getelementptr inbounds ([{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]], [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]]* [[GLOBAL_RD]], i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0)
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 84
// CHECK-DAG: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 1
@@ -31,8 +30,7 @@ int main(int argc, char **argv) {
// CHECK-LABEL: define internal void @__omp_offloading_{{.*}}_main_l17_worker(
// CHECK: define weak void @__omp_offloading_{{.*}}_main_l17([10 x i32]* dereferenceable(40) %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}})
-// CHECK: [[GLOBAL_RD:%.+]] = load i8*, i8** [[GLOBAL_RD_PTR]],
-// CHECK: call void @__kmpc_get_team_static_memory(i8* [[GLOBAL_RD]], i{{64|32}} 84, i16 0, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
+// CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 84, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// CHECK: [[STACK:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty*
// CHECK: [[ARGC:%.+]] = load i32, i32* %{{.+}}, align
@@ -48,7 +46,7 @@ int main(int argc, char **argv) {
// CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @
-// CHECK: call void @__kmpc_restore_team_static_memory(i16 0)
+// CHECK: call void @__kmpc_restore_team_static_memory(i16 1)
// CHECK: define internal void [[PARALLEL]](
// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack(
diff --git a/test/OpenMP/nvptx_parallel_codegen.cpp b/test/OpenMP/nvptx_parallel_codegen.cpp
index 5f0f09af87..a3b1b012bf 100644
--- a/test/OpenMP/nvptx_parallel_codegen.cpp
+++ b/test/OpenMP/nvptx_parallel_codegen.cpp
@@ -72,10 +72,10 @@ int bar(int n){
}
// CHECK: [[MEM_TY:%.+]] = type { [4 x i8] }
-// CHECK-DAG: [[GLOBAL_RD:@.+]] = weak global [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]] zeroinitializer
-// CHECK-DAG: [[GLOBAL_RD_PTR:@.+]] = weak unnamed_addr constant i8* getelementptr inbounds ([{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]], [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]]* [[GLOBAL_RD]], i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0)
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
+// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
// CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}_worker()
@@ -324,9 +324,9 @@ int bar(int n){
// CHECK-32: [[A_ADDR:%.+]] = alloca i32,
// CHECK-64: [[A_ADDR:%.+]] = alloca i64,
// CHECK-64: [[CONV:%.+]] = bitcast i64* [[A_ADDR]] to i32*
-// CHECK: [[GLOBAL_RD:%.+]] = load i8*, i8** [[GLOBAL_RD_PTR]],
+// CHECK: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
-// CHECK: call void @__kmpc_get_team_static_memory(i8* [[GLOBAL_RD]], i{{64|32}} [[SIZE]], i16 0, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
+// CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// CHECK: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// CHECK: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
// CHECK: [[BC:%.+]] = bitcast i8* [[STACK]] to %struct._globalized_locals_ty*
@@ -334,7 +334,8 @@ int bar(int n){
// CHECK-64: [[A:%.+]] = load i32, i32* [[CONV]],
// CHECK: [[GLOBAL_A_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: store i32 [[A]], i32* [[GLOBAL_A_ADDR]],
-// CHECK: call void @__kmpc_restore_team_static_memory(i16 0)
+// CHECK: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
+// CHECK: call void @__kmpc_restore_team_static_memory(i16 [[IS_SHARED]])
// CHECK-LABEL: define internal void @{{.+}}(i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.*}})
// CHECK: [[CC:%.+]] = alloca i32,
diff --git a/test/OpenMP/nvptx_parallel_for_codegen.cpp b/test/OpenMP/nvptx_parallel_for_codegen.cpp
index 75379c08b4..c292152e65 100644
--- a/test/OpenMP/nvptx_parallel_for_codegen.cpp
+++ b/test/OpenMP/nvptx_parallel_for_codegen.cpp
@@ -31,10 +31,10 @@ int bar(int n){
}
// CHECK: [[MEM_TY:%.+]] = type { [4 x i8] }
-// CHECK-DAG: [[GLOBAL_RD:@.+]] = weak global [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]] zeroinitializer
-// CHECK-DAG: [[GLOBAL_RD_PTR:@.+]] = weak unnamed_addr constant i8* getelementptr inbounds ([{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]], [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]]* [[GLOBAL_RD]], i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0)
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
+// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l12}}_worker()
// CHECK: call void @llvm.nvvm.barrier0()
@@ -45,9 +45,9 @@ int bar(int n){
// CHECK: call void @__omp_offloading_{{.*}}l12_worker()
// CHECK: call void @__kmpc_kernel_init(
// CHECK: call void @__kmpc_data_sharing_init_stack()
-// CHECK: [[GLOBAL_RD:%.+]] = load i8*, i8** [[GLOBAL_RD_PTR]],
+// CHECK: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
-// CHECK: call void @__kmpc_get_team_static_memory(i8* [[GLOBAL_RD]], i{{64|32}} [[SIZE]], i16 0, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
+// CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 %7, i16 %6, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// CHECK: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// CHECK: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
// CHECK: call void @__kmpc_kernel_prepare_parallel(
@@ -55,7 +55,8 @@ int bar(int n){
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: call void @__kmpc_end_sharing_variables()
-// CHECK: call void @__kmpc_restore_team_static_memory(i16 0)
+// CHECK: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
+// CHECK: call void @__kmpc_restore_team_static_memory(i16 [[IS_SHARED]])
// CHECK: call void @__kmpc_kernel_deinit(i16 1)
// CHECK: define internal void @__omp_outlined__(
diff --git a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
index 3a46886d8c..0f95d5ad4e 100644
--- a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
+++ b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
@@ -68,23 +68,22 @@ int bar(int n){
}
// CHECK-DAG: [[MEM_TY:%.+]] = type { [4 x i8] }
-// CHECK-DAG: [[GLOBAL_RD:@.+]] = weak global [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]] zeroinitializer
-// CHECK-DAG: [[GLOBAL_RD_PTR:@.+]] = weak unnamed_addr constant i8* getelementptr inbounds ([{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]], [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]]* [[GLOBAL_RD]], i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0)
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
+// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l32(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 1)
-// CHECK: [[GLOBAL_RD:%.+]] = load i8*, i8** [[GLOBAL_RD_PTR]],
-// CHECK: call void @__kmpc_get_team_static_memory(i8* [[GLOBAL_RD]], i{{64|32}} 4, i16 0, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
+// CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// CHECK: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// CHECK: [[BC:%.+]] = bitcast i8* [[TEAM_ALLOC]] to [[REC:%.+]]*
// CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
// CHECK: {{call|invoke}} void [[OUTL1:@.+]](
// CHECK: call void @__kmpc_for_static_fini(
-// CHECK: call void @__kmpc_restore_team_static_memory(i16 0)
+// CHECK: call void @__kmpc_restore_team_static_memory(i16 1)
// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: ret void
diff --git a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
index f7a3aaecc3..3b56bfd9a7 100644
--- a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
+++ b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
@@ -63,23 +63,22 @@ int bar(int n){
}
// CHECK-DAG: [[MEM_TY:%.+]] = type { [4 x i8] }
-// CHECK-DAG: [[GLOBAL_RD:@.+]] = weak global [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]] zeroinitializer
-// CHECK-DAG: [[GLOBAL_RD_PTR:@.+]] = weak unnamed_addr constant i8* getelementptr inbounds ([{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]], [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]]* [[GLOBAL_RD]], i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0)
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
+// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l30(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 1)
-// CHECK: [[GLOBAL_RD:%.+]] = load i8*, i8** [[GLOBAL_RD_PTR]],
-// CHECK: call void @__kmpc_get_team_static_memory(i8* [[GLOBAL_RD]], i{{64|32}} 4, i16 0, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
+// CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// CHECK: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// CHECK: [[BC:%.+]] = bitcast i8* [[TEAM_ALLOC]] to [[REC:%.+]]*
// CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
// CHECK: {{call|invoke}} void [[OUTL1:@.+]](
// CHECK: call void @__kmpc_for_static_fini(
-// CHECK: call void @__kmpc_restore_team_static_memory(i16 0)
+// CHECK: call void @__kmpc_restore_team_static_memory(i16 1)
// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: ret void
diff --git a/test/OpenMP/nvptx_teams_codegen.cpp b/test/OpenMP/nvptx_teams_codegen.cpp
index af1ae40cde..98988a5d4d 100644
--- a/test/OpenMP/nvptx_teams_codegen.cpp
+++ b/test/OpenMP/nvptx_teams_codegen.cpp
@@ -28,11 +28,12 @@ int main (int argc, char **argv) {
}
// CK1: [[MEM_TY:%.+]] = type { [{{4|8}} x i8] }
-// CK1-DAG: [[GLOBAL_RD:@.+]] = weak global [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]] zeroinitializer
-// CK1-DAG: [[GLOBAL_RD_PTR:@.+]] = weak unnamed_addr constant i8* getelementptr inbounds ([{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]], [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]]* [[GLOBAL_RD]], i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0)
+// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
// CK1-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CK1-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4
// CK1-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}}
+// CK1-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1
+// CK1-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1
// only nvptx side: do not outline teams region and do not call fork_teams
// CK1: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[ARGC:%.+]])
@@ -43,9 +44,9 @@ int main (int argc, char **argv) {
// CK1: store {{.+}} 0, {{.+}},
// CK1: store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]],
// CK1-64: [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}*
-// CK1: [[GLOBAL_RD:%.+]] = load i8*, i8** [[GLOBAL_RD_PTR]],
+// CK1: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]],
// CK1: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]],
-// CK1: call void @__kmpc_get_team_static_memory(i8* [[GLOBAL_RD]], i{{64|32}} [[SIZE]], i16 0, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
+// CK1: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// CK1: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// CK1: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
// CK1-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]]
@@ -64,9 +65,9 @@ int main (int argc, char **argv) {
// CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{.+}}***,
// CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}**,
// CK1: store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]]
-// CK1: [[GLOBAL_RD:%.+]] = load i8*, i8** [[GLOBAL_RD_PTR]],
+// CK1: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]],
// CK1: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]],
-// CK1: call void @__kmpc_get_team_static_memory(i8* [[GLOBAL_RD]], i{{64|32}} [[SIZE]], i16 0, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
+// CK1: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// CK1: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// CK1: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
// CK1: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]]
@@ -114,11 +115,12 @@ int main (int argc, char **argv) {
}
// CK2: [[MEM_TY:%.+]] = type { [{{4|8}} x i8] }
-// CK2-DAG: [[GLOBAL_RD:@.+]] = weak global [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]] zeroinitializer
-// CK2-DAG: [[GLOBAL_RD_PTR:@.+]] = weak unnamed_addr constant i8* getelementptr inbounds ([{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]], [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]]* [[GLOBAL_RD]], i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0)
+// CK2-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
// CK2-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CK2-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4
// CK2-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}}
+// CK2-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1
+// CK2-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1
// CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}} [[ARGC_IN:.+]])
// CK2: {{.}} = alloca i{{[0-9]+}}*,
@@ -133,9 +135,9 @@ int main (int argc, char **argv) {
// CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32*
// CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32*
// CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32*
-// CK2: [[GLOBAL_RD:%.+]] = load i8*, i8** [[GLOBAL_RD_PTR]],
+// CK2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]],
// CK2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]],
-// CK2: call void @__kmpc_get_team_static_memory(i8* [[GLOBAL_RD]], i{{64|32}} [[SIZE]], i16 0, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
+// CK2: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// CK2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// CK2: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
// CK2-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]]
@@ -158,9 +160,9 @@ int main (int argc, char **argv) {
// CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]],
// CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]],
// CK2: store i{{[0-9]+}}** [[ARGC]], i{{[0-9]+}}*** [[ARGCADDR]],
-// CK2: [[GLOBAL_RD:%.+]] = load i8*, i8** [[GLOBAL_RD_PTR]],
+// CK2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]],
// CK2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]],
-// CK2: call void @__kmpc_get_team_static_memory(i8* [[GLOBAL_RD]], i{{64|32}} [[SIZE]], i16 0, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
+// CK2: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// CK2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// CK2: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
// CK2: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]]