aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>2018-07-13 16:14:22 +0000
committerGheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>2018-07-13 16:14:22 +0000
commit2f7f7ababf73667e7b00ab19394606c3a4f47712 (patch)
tree8b7d73312479b5d98e9a1aabfcfbdf9a33a68f51
parent07d9ad356558ea7700cc1794848a7e327f9192ea (diff)
[OpenMP][libomptarget] Fix data sharing and globalization infrastructure to work in SPMD mode
Summary: This patch fixes the data sharing infrastructure to work for the SPMD and non-SPMD cases. Reviewers: ABataev, grokos, carlo.bertolli, caomhin Reviewed By: ABataev, grokos Subscribers: guansong, openmp-commits Differential Revision: https://reviews.llvm.org/D49204 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@337013 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/data_sharing.cu163
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/interface.h1
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h13
3 files changed, 72 insertions, 105 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
index 8340c08..2a1709f 100644
--- a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -44,7 +44,7 @@ __device__ static bool IsWarpMasterActiveThread() {
}
// Return true if this is the master thread.
__device__ static bool IsMasterThread() {
- return getMasterThreadId() == getThreadId();
+ return !isSPMDMode() && getMasterThreadId() == getThreadId();
}
/// Return the provided size aligned to the size of a pointer.
@@ -330,39 +330,40 @@ __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
// Runtime functions for trunk data sharing scheme.
////////////////////////////////////////////////////////////////////////////////
+INLINE void data_sharing_init_stack_common() {
+ omptarget_nvptx_TeamDescr *teamDescr =
+ &omptarget_nvptx_threadPrivateContext->TeamContext();
+
+ for (int WID = 0; WID < WARPSIZE; WID++) {
+ __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
+ DataSharingState.SlotPtr[WID] = RootS;
+ DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
+ }
+}
+
// Initialize data sharing data structure. This function needs to be called
// once at the beginning of a data sharing context (coincides with the kernel
-// initialization).
+// initialization). This function is called only by the MASTER thread of each
+// team in non-SPMD mode.
EXTERN void __kmpc_data_sharing_init_stack() {
// This function initializes the stack pointer with the pointer to the
// statically allocated shared memory slots. The size of a shared memory
// slot is pre-determined to be 256 bytes.
+ data_sharing_init_stack_common();
+ omptarget_nvptx_globalArgs.Init();
+}
- // Initialize the data sharing structures. This section should only be
- // executed by the warp active master threads.
- if (IsWarpMasterActiveThread()) {
- unsigned WID = getWarpId();
- omptarget_nvptx_TeamDescr *teamDescr =
- &omptarget_nvptx_threadPrivateContext->TeamContext();
- __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID, IsMasterThread());
-
- // If a valid address has been returned then proceed with the initalization.
- // Otherwise the initialization of the slot has already happened in a
- // previous call to this function.
- if (RootS) {
- DataSharingState.SlotPtr[WID] = RootS;
- DataSharingState.TailPtr[WID] = RootS;
- DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
- }
- }
+// Initialize data sharing data structure. This function needs to be called
+// once at the beginning of a data sharing context (coincides with the kernel
+// initialization). This function is called in SPMD mode only.
+EXTERN void __kmpc_data_sharing_init_stack_spmd() {
+ // This function initializes the stack pointer with the pointer to the
+ // statically allocated shared memory slots. The size of a shared memory
+ // slot is pre-determined to be 256 bytes.
+ if (threadIdx.x == 0)
+ data_sharing_init_stack_common();
- // Currently we only support the sharing of variables between master and
- // workers. The list of references to shared variables exists only for
- // the master thread.
- if (IsMasterThread()) {
- // Initialize the list of references to arguments.
- omptarget_nvptx_globalArgs.Init();
- }
+ __threadfence_block();
}
// Called at the time of the kernel initialization. This is used to initilize
@@ -372,8 +373,6 @@ EXTERN void __kmpc_data_sharing_init_stack() {
// By default the globalized variables are stored in global memory. If the
// UseSharedMemory is set to true, the runtime will attempt to use shared memory
// as long as the size requested fits the pre-allocated size.
-//
-// Called by: master, TODO: call by workers
EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
int16_t UseSharedMemory) {
// Frame pointer must be visible to all workers in the same warp.
@@ -385,7 +384,6 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
// SlotP will point to either the shared memory slot or an existing
// global memory slot.
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
- __kmpc_data_sharing_slot *&TailSlotP = DataSharingState.TailPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
// Compute the total memory footprint of the requested data.
@@ -405,62 +403,31 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
// of the slot then we need to either re-use the next slot, if one exists,
// or create a new slot.
if (EndAddress < RequestedEndAddress) {
- size_t NewSize = PushSize;
-
- // The new or reused slot for holding the data being pushed.
__kmpc_data_sharing_slot *NewSlot = 0;
+ size_t NewSize = PushSize;
- // Check if there is a next slot.
- if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
- // Attempt to reuse an existing slot provided the data fits in the slot.
- // The leftover data space will not be used.
- ptrdiff_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
- (uintptr_t)(&ExistingSlot->Data[0]);
-
- // Try to add the data in the next available slot. Search for a slot
- // with enough space.
- while (ExistingSlotSize < NewSize) {
- SlotP->Next = ExistingSlot->Next;
- SlotP->Next->Prev = ExistingSlot->Prev;
- free(ExistingSlot);
- ExistingSlot = SlotP->Next;
- if (!ExistingSlot)
- break;
- ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
- (uintptr_t)(&ExistingSlot->Data[0]);
- }
-
- // Check if a slot has been found.
- if (ExistingSlotSize >= NewSize) {
- NewSlot = ExistingSlot;
- NewSlot->PrevSlotStackPtr = StackP;
- }
- }
-
- if (!NewSlot) {
- // Allocate at least the default size for each type of slot.
- size_t DefaultSlotSize =
- IsMasterThread() ? DS_Slot_Size : DS_Worker_Warp_Slot_Size;
- if (DefaultSlotSize > NewSize)
- NewSize = DefaultSlotSize;
- NewSlot = (__kmpc_data_sharing_slot *)malloc(
- sizeof(__kmpc_data_sharing_slot) + NewSize);
- NewSlot->Next = 0;
- NewSlot->Prev = SlotP;
- NewSlot->PrevSlotStackPtr = StackP;
- NewSlot->DataEnd = &NewSlot->Data[NewSize];
-
- // Newly allocated slots are also tail slots.
- TailSlotP = NewSlot;
+ // Allocate at least the default size for each type of slot.
+ // Master is a special case and even though there is only one thread,
+ // it can share more things with the workers. For uniformity, it uses
+ // the full size of a worker warp slot.
+ size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size;
+ if (DefaultSlotSize > NewSize)
+ NewSize = DefaultSlotSize;
+ NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc(
+ sizeof(__kmpc_data_sharing_slot) + NewSize,
+ "Global memory slot allocation.");
- // Make previous slot point to the newly allocated slot.
- SlotP->Next = NewSlot;
- }
+ NewSlot->Next = 0;
+ NewSlot->Prev = SlotP;
+ NewSlot->PrevSlotStackPtr = StackP;
+ NewSlot->DataEnd = &NewSlot->Data[0] + NewSize;
+ // Make previous slot point to the newly allocated slot.
+ SlotP->Next = NewSlot;
// The current slot becomes the new slot.
SlotP = NewSlot;
// The stack pointer always points to the next free stack frame.
- StackP = &NewSlot->Data[PushSize];
+ StackP = &NewSlot->Data[0] + PushSize;
// The frame pointer always points to the beginning of the frame.
FrameP = &NewSlot->Data[0];
} else {
@@ -489,37 +456,27 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
if (IsWarpMasterActiveThread()) {
unsigned WID = getWarpId();
+ // Current slot
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+
+ // Pointer to next available stack.
void *&StackP = DataSharingState.StackPtr[WID];
- // Pop current frame from slot.
+ // If the current slot is empty, we need to free the slot after the
+ // pop.
+ bool SlotEmpty = (StackP == &SlotP->Data[0]);
+
+ // Pop the frame.
StackP = FrameStart;
- // If we try to pop the last frame of the current slot we need to
- // move to the previous slot if there is one.
- const uintptr_t StartAddress = (uintptr_t)FrameStart;
- if (StartAddress == (uintptr_t)&SlotP->Data[0]) {
- if (SlotP->Prev) {
- // The new stack pointer is the end of the data field of the
- // previous slot. This will allow the stack pointer to be
- // used in the computation of the remaining data space in
- // the current slot.
- StackP = SlotP->PrevSlotStackPtr;
- // Reset SlotP to previous slot.
- SlotP = SlotP->Prev;
- }
+ if (SlotEmpty && SlotP->Prev) {
+ // Before removing the slot we need to reset StackP.
+ StackP = SlotP->PrevSlotStackPtr;
- // If this will "pop" the last global memory node then it is likely
- // that we are at the end of the data sharing region and we can
- // de-allocate any existing global memory slots.
- if (!SlotP->Prev) {
- __kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr[WID];
- while(Tail->Prev) {
- Tail = Tail->Prev;
- free(Tail->Next);
- }
- Tail->Next=0;
- }
+ // Remove the slot.
+ SlotP = SlotP->Prev;
+ SafeFree(SlotP->Next, "Free slot.");
+ SlotP->Next = 0;
}
}
diff --git a/libomptarget/deviceRTLs/nvptx/src/interface.h b/libomptarget/deviceRTLs/nvptx/src/interface.h
index a02d962..680df48 100644
--- a/libomptarget/deviceRTLs/nvptx/src/interface.h
+++ b/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -485,6 +485,7 @@ EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);
EXTERN void __kmpc_data_sharing_init_stack();
+EXTERN void __kmpc_data_sharing_init_stack_spmd();
EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
EXTERN void __kmpc_data_sharing_pop_stack(void *a);
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index b8b6975..84c61f9 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -122,7 +122,6 @@ enum DATA_SHARING_SIZES {
struct DataSharingStateTy {
__kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
void *StackPtr[DS_Max_Warp_Number];
- __kmpc_data_sharing_slot *TailPtr[DS_Max_Warp_Number];
void *FramePtr[DS_Max_Warp_Number];
int32_t ActiveThreads[DS_Max_Warp_Number];
};
@@ -302,6 +301,16 @@ public:
return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
}
+ INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
+ worker_rootS[wid].DataEnd =
+ &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
+ // We currently do not have a next slot.
+ worker_rootS[wid].Next = 0;
+ worker_rootS[wid].Prev = 0;
+ worker_rootS[wid].PrevSlotStackPtr = 0;
+ return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
+ }
+
private:
omptarget_nvptx_TaskDescr
levelZeroTaskDescr; // icv for team master initial thread
@@ -311,7 +320,7 @@ private:
uint64_t lastprivateIterBuffer;
__align__(16)
- __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE - 1];
+ __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
__align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
};