aboutsummaryrefslogtreecommitdiff
path: root/final/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
diff options
context:
space:
mode:
Diffstat (limited to 'final/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu')
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu581
1 files changed, 581 insertions, 0 deletions
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/final/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
new file mode 100644
index 0000000..50b8654
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -0,0 +1,581 @@
+//===----- data_sharing.cu - NVPTX OpenMP debug utilities -------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of data sharing environments/
+//
+//===----------------------------------------------------------------------===//
+#include "omptarget-nvptx.h"
+#include <stdio.h>
+
+// Warp ID in the CUDA block
+INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
+// Lane ID in the CUDA warp.
+INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
+
+// Return true if this is the first active thread in the warp.
+INLINE static bool IsWarpMasterActiveThread() {
+ unsigned long long Mask = __ACTIVEMASK();
+ unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);
+ unsigned long long Sh = Mask << ShNum;
+ // Truncate Sh to the 32 lower bits
+ return (unsigned)Sh == 0;
+}
+// Return true if this is the master thread.
+INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
+ return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
+}
+
+/// Return the provided size aligned to the size of a pointer.
+INLINE static size_t AlignVal(size_t Val) {
+ const size_t Align = (size_t)sizeof(void *);
+ if (Val & (Align - 1)) {
+ Val += Align;
+ Val &= ~(Align - 1);
+ }
+ return Val;
+}
+
+#define DSFLAG 0
+#define DSFLAG_INIT 0
+#define DSPRINT(_flag, _str, _args...) \
+ { \
+ if (_flag) { \
+ /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/ \
+ } \
+ }
+#define DSPRINT0(_flag, _str) \
+ { \
+ if (_flag) { \
+ /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/ \
+ } \
+ }
+
+// Initialize the shared data structures. This is expected to be called for the
+// master thread and warp masters. \param RootS: A pointer to the root of the
+// data sharing stack. \param InitialDataSize: The initial size of the data in
+// the slot.
+EXTERN void
+__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
+ size_t InitialDataSize) {
+ ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
+ DSPRINT0(DSFLAG_INIT,
+ "Entering __kmpc_initialize_data_sharing_environment\n");
+
+ unsigned WID = getWarpId();
+ DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID);
+
+ omptarget_nvptx_TeamDescr *teamDescr =
+ &omptarget_nvptx_threadPrivateContext->TeamContext();
+ __kmpc_data_sharing_slot *RootS =
+ teamDescr->RootS(WID, IsMasterThread(isSPMDMode()));
+
+ DataSharingState.SlotPtr[WID] = RootS;
+ DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
+
+ // We don't need to initialize the frame and active threads.
+
+ DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize);
+ DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS);
+ DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n",
+ (unsigned long long)RootS->DataEnd);
+ DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n",
+ (unsigned long long)RootS->Next);
+ DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n",
+ (unsigned long long)DataSharingState.SlotPtr[WID]);
+ DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n",
+ (unsigned long long)DataSharingState.StackPtr[WID]);
+
+ DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n");
+}
+
+EXTERN void *__kmpc_data_sharing_environment_begin(
+ __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
+ void **SavedSharedFrame, int32_t *SavedActiveThreads,
+ size_t SharingDataSize, size_t SharingDefaultDataSize,
+ int16_t IsOMPRuntimeInitialized) {
+
+ DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n");
+
+ // If the runtime has been elided, used __shared__ memory for master-worker
+ // data sharing.
+ if (!IsOMPRuntimeInitialized)
+ return (void *)&DataSharingState;
+
+ DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
+ DSPRINT(DSFLAG, "Default Data Size %016llx\n",
+ (unsigned long long)SharingDefaultDataSize);
+
+ unsigned WID = getWarpId();
+ unsigned CurActiveThreads = __ACTIVEMASK();
+
+ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+ void *&StackP = DataSharingState.StackPtr[WID];
+ void * volatile &FrameP = DataSharingState.FramePtr[WID];
+ int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
+
+ DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
+ // Save the current values.
+ *SavedSharedSlot = SlotP;
+ *SavedSharedStack = StackP;
+ *SavedSharedFrame = FrameP;
+ *SavedActiveThreads = ActiveT;
+
+ DSPRINT(DSFLAG, "Warp ID: %u\n", WID);
+ DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP);
+ DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP);
+ DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP);
+ DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
+
+ // Only the warp active master needs to grow the stack.
+ if (IsWarpMasterActiveThread()) {
+ // Save the current active threads.
+ ActiveT = CurActiveThreads;
+
+ // Make sure we use aligned sizes to avoid rematerialization of data.
+ SharingDataSize = AlignVal(SharingDataSize);
+ // FIXME: The default data size can be assumed to be aligned?
+ SharingDefaultDataSize = AlignVal(SharingDefaultDataSize);
+
+ // Check if we have room for the data in the current slot.
+ const uintptr_t CurrentStartAddress = (uintptr_t)StackP;
+ const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd;
+ const uintptr_t RequiredEndAddress =
+ CurrentStartAddress + (uintptr_t)SharingDataSize;
+
+ DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
+ DSPRINT(DSFLAG, "Default Data Size %016llx\n",
+ (unsigned long long)SharingDefaultDataSize);
+ DSPRINT(DSFLAG, "Current Start Address %016llx\n",
+ (unsigned long long)CurrentStartAddress);
+ DSPRINT(DSFLAG, "Current End Address %016llx\n",
+ (unsigned long long)CurrentEndAddress);
+ DSPRINT(DSFLAG, "Required End Address %016llx\n",
+ (unsigned long long)RequiredEndAddress);
+ DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT);
+
+ // If we require a new slot, allocate it and initialize it (or attempt to
+ // reuse one). Also, set the shared stack and slot pointers to the new
+ // place. If we do not need to grow the stack, just adapt the stack and
+ // frame pointers.
+ if (CurrentEndAddress < RequiredEndAddress) {
+ size_t NewSize = (SharingDataSize > SharingDefaultDataSize)
+ ? SharingDataSize
+ : SharingDefaultDataSize;
+ __kmpc_data_sharing_slot *NewSlot = 0;
+
+ // Attempt to reuse an existing slot.
+ if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
+ uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
+ (uintptr_t)(&ExistingSlot->Data[0]);
+ if (ExistingSlotSize >= NewSize) {
+ DSPRINT(DSFLAG, "Reusing stack slot %016llx\n",
+ (unsigned long long)ExistingSlot);
+ NewSlot = ExistingSlot;
+ } else {
+ DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n",
+ (unsigned long long)SlotP->Next);
+ free(ExistingSlot);
+ }
+ }
+
+ if (!NewSlot) {
+ NewSlot = (__kmpc_data_sharing_slot *)malloc(
+ sizeof(__kmpc_data_sharing_slot) + NewSize);
+ DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n",
+ (unsigned long long)NewSlot, NewSize);
+ }
+
+ NewSlot->Next = 0;
+ NewSlot->DataEnd = &NewSlot->Data[NewSize];
+
+ SlotP->Next = NewSlot;
+ SlotP = NewSlot;
+ StackP = &NewSlot->Data[SharingDataSize];
+ FrameP = &NewSlot->Data[0];
+ } else {
+
+ // Clean up any old slot that we may still have. The slot producers, do
+ // not eliminate them because that may be used to return data.
+ if (SlotP->Next) {
+ DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n",
+ (unsigned long long)SlotP->Next);
+ free(SlotP->Next);
+ SlotP->Next = 0;
+ }
+
+ FrameP = StackP;
+ StackP = (void *)RequiredEndAddress;
+ }
+ }
+
+ // FIXME: Need to see the impact of doing it here.
+ __threadfence_block();
+
+ DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n");
+
+ // All the threads in this warp get the frame they should work with.
+ return FrameP;
+}
+
+EXTERN void __kmpc_data_sharing_environment_end(
+ __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
+ void **SavedSharedFrame, int32_t *SavedActiveThreads,
+ int32_t IsEntryPoint) {
+
+ DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");
+
+ unsigned WID = getWarpId();
+
+ if (IsEntryPoint) {
+ if (IsWarpMasterActiveThread()) {
+ DSPRINT0(DSFLAG, "Doing clean up\n");
+
+ // The master thread cleans the saved slot, because this is an environment
+ // only for the master.
+ __kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode())
+ ? *SavedSharedSlot
+ : DataSharingState.SlotPtr[WID];
+
+ if (S->Next) {
+ free(S->Next);
+ S->Next = 0;
+ }
+ }
+
+ DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n");
+ return;
+ }
+
+ int32_t CurActive = __ACTIVEMASK();
+
+ // Only the warp master can restore the stack and frame information, and only
+ // if there are no other threads left behind in this environment (i.e. the
+ // warp diverged and returns in different places). This only works if we
+ // assume that threads will converge right after the call site that started
+ // the environment.
+ if (IsWarpMasterActiveThread()) {
+ int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
+
+ DSPRINT0(DSFLAG, "Before restoring the stack\n");
+ // Zero the bits in the mask. If it is still different from zero, then we
+ // have other threads that will return after the current ones.
+ ActiveT &= ~CurActive;
+
+ DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n",
+ (unsigned)CurActive, (unsigned)ActiveT);
+
+ if (!ActiveT) {
+ // No other active threads? Great, lets restore the stack.
+
+ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+ void *&StackP = DataSharingState.StackPtr[WID];
+ void * volatile &FrameP = DataSharingState.FramePtr[WID];
+
+ SlotP = *SavedSharedSlot;
+ StackP = *SavedSharedStack;
+ FrameP = *SavedSharedFrame;
+ ActiveT = *SavedActiveThreads;
+
+ DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n",
+ (unsigned long long)SlotP);
+ DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n",
+ (unsigned long long)StackP);
+ DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n",
+ (unsigned long long)FrameP);
+ DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
+ }
+ }
+
+ // FIXME: Need to see the impact of doing it here.
+ __threadfence_block();
+
+ DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n");
+ return;
+}
+
+EXTERN void *
+__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
+ int16_t IsOMPRuntimeInitialized) {
+ DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n");
+
+ // If the runtime has been elided, use __shared__ memory for master-worker
+ // data sharing. We're reusing the statically allocated data structure
+ // that is used for standard data sharing.
+ if (!IsOMPRuntimeInitialized)
+ return (void *)&DataSharingState;
+
+ // Get the frame used by the requested thread.
+
+ unsigned SourceWID = SourceThreadID / WARPSIZE;
+
+ DSPRINT(DSFLAG, "Source warp: %u\n", SourceWID);
+
+ void * volatile P = DataSharingState.FramePtr[SourceWID];
+ DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
+ return P;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Runtime functions for trunk data sharing scheme.
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE static void data_sharing_init_stack_common() {
+ ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
+ omptarget_nvptx_TeamDescr *teamDescr =
+ &omptarget_nvptx_threadPrivateContext->TeamContext();
+
+ for (int WID = 0; WID < WARPSIZE; WID++) {
+ __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
+ DataSharingState.SlotPtr[WID] = RootS;
+ DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
+ }
+}
+
+// Initialize data sharing data structure. This function needs to be called
+// once at the beginning of a data sharing context (coincides with the kernel
+// initialization). This function is called only by the MASTER thread of each
+// team in non-SPMD mode.
+EXTERN void __kmpc_data_sharing_init_stack() {
+ ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
+ // This function initializes the stack pointer with the pointer to the
+ // statically allocated shared memory slots. The size of a shared memory
+ // slot is pre-determined to be 256 bytes.
+ data_sharing_init_stack_common();
+ omptarget_nvptx_globalArgs.Init();
+}
+
+// Initialize data sharing data structure. This function needs to be called
+// once at the beginning of a data sharing context (coincides with the kernel
+// initialization). This function is called in SPMD mode only.
+EXTERN void __kmpc_data_sharing_init_stack_spmd() {
+ ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
+ // This function initializes the stack pointer with the pointer to the
+ // statically allocated shared memory slots. The size of a shared memory
+ // slot is pre-determined to be 256 bytes.
+ if (threadIdx.x == 0)
+ data_sharing_init_stack_common();
+
+ __threadfence_block();
+}
+
+INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
+ ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
+
+ // Only warp active master threads manage the stack.
+ bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0;
+
+ // Add worst-case padding to DataSize so that future stack allocations are
+ // correctly aligned.
+ const size_t Alignment = 8;
+ PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
+
+ // Frame pointer must be visible to all workers in the same warp.
+ const unsigned WID = getWarpId();
+ void *FrameP = 0;
+ int32_t CurActive = __ACTIVEMASK();
+
+ if (IsWarpMaster) {
+ // SlotP will point to either the shared memory slot or an existing
+ // global memory slot.
+ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+ void *&StackP = DataSharingState.StackPtr[WID];
+
+ // Check if we have room for the data in the current slot.
+ const uintptr_t StartAddress = (uintptr_t)StackP;
+ const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
+ const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize;
+
+ // If we requested more data than there is room for in the rest
+ // of the slot then we need to either re-use the next slot, if one exists,
+ // or create a new slot.
+ if (EndAddress < RequestedEndAddress) {
+ __kmpc_data_sharing_slot *NewSlot = 0;
+ size_t NewSize = PushSize;
+
+ // Allocate at least the default size for each type of slot.
+ // Master is a special case and even though there is only one thread,
+ // it can share more things with the workers. For uniformity, it uses
+ // the full size of a worker warp slot.
+ size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size;
+ if (DefaultSlotSize > NewSize)
+ NewSize = DefaultSlotSize;
+ NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc(
+ sizeof(__kmpc_data_sharing_slot) + NewSize,
+ "Global memory slot allocation.");
+
+ NewSlot->Next = 0;
+ NewSlot->Prev = SlotP;
+ NewSlot->PrevSlotStackPtr = StackP;
+ NewSlot->DataEnd = &NewSlot->Data[0] + NewSize;
+
+ // Make previous slot point to the newly allocated slot.
+ SlotP->Next = NewSlot;
+ // The current slot becomes the new slot.
+ SlotP = NewSlot;
+ // The stack pointer always points to the next free stack frame.
+ StackP = &NewSlot->Data[0] + PushSize;
+ // The frame pointer always points to the beginning of the frame.
+ FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0];
+ } else {
+ // Add the data chunk to the current slot. The frame pointer is set to
+ // point to the start of the new frame held in StackP.
+ FrameP = DataSharingState.FramePtr[WID] = StackP;
+ // Reset stack pointer to the requested address.
+ StackP = (void *)RequestedEndAddress;
+ }
+ }
+ // Get address from lane 0.
+ ((int *)&FrameP)[0] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[0], 0);
+ if (sizeof(FrameP) == 8)
+ ((int *)&FrameP)[1] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[1], 0);
+
+ return FrameP;
+}
+
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
+ int16_t UseSharedMemory) {
+ return data_sharing_push_stack_common(DataSize);
+}
+
+// Called at the time of the kernel initialization. This is used to initilize
+// the list of references to shared variables and to pre-allocate global storage
+// for holding the globalized variables.
+//
+// By default the globalized variables are stored in global memory. If the
+// UseSharedMemory is set to true, the runtime will attempt to use shared memory
+// as long as the size requested fits the pre-allocated size.
+EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
+ int16_t UseSharedMemory) {
+ // Compute the total memory footprint of the requested data.
+ // The master thread requires a stack only for itself. A worker
+ // thread (which at this point is a warp master) will require
+ // space for the variables of each thread in the warp,
+ // i.e. one DataSize chunk per warp lane.
+ // TODO: change WARPSIZE to the number of active threads in the warp.
+ size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode()))
+ ? DataSize
+ : WARPSIZE * DataSize;
+
+ // Compute the start address of the frame of each thread in the warp.
+ uintptr_t FrameStartAddress =
+ (uintptr_t) data_sharing_push_stack_common(PushSize);
+ FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
+ return (void *)FrameStartAddress;
+}
+
+// Pop the stack and free any memory which can be reclaimed.
+//
+// When the pop operation removes the last global memory slot,
+// reclaim all outstanding global memory slots since it is
+// likely we have reached the end of the kernel.
+EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
+ ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
+
+ __threadfence_block();
+
+ if (GetThreadIdInBlock() % WARPSIZE == 0) {
+ unsigned WID = getWarpId();
+
+ // Current slot
+ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+
+ // Pointer to next available stack.
+ void *&StackP = DataSharingState.StackPtr[WID];
+
+ // Pop the frame.
+ StackP = FrameStart;
+
+ // If the current slot is empty, we need to free the slot after the
+ // pop.
+ bool SlotEmpty = (StackP == &SlotP->Data[0]);
+
+ if (SlotEmpty && SlotP->Prev) {
+ // Before removing the slot we need to reset StackP.
+ StackP = SlotP->PrevSlotStackPtr;
+
+ // Remove the slot.
+ SlotP = SlotP->Prev;
+ SafeFree(SlotP->Next, "Free slot.");
+ SlotP->Next = 0;
+ }
+ }
+}
+
+// Begin a data sharing context. Maintain a list of references to shared
+// variables. This list of references to shared variables will be passed
+// to one or more threads.
+// In L0 data sharing this is called by master thread.
+// In L1 data sharing this is called by active warp master thread.
+EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
+ omptarget_nvptx_globalArgs.EnsureSize(nArgs);
+ *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
+}
+
+// End a data sharing context. There is no need to have a list of refs
+// to shared variables because the context in which those variables were
+// shared has now ended. This should clean-up the list of references only
+// without affecting the actual global storage of the variables.
+// In L0 data sharing this is called by master thread.
+// In L1 data sharing this is called by active warp master thread.
+EXTERN void __kmpc_end_sharing_variables() {
+ omptarget_nvptx_globalArgs.DeInit();
+}
+
+// This function will return a list of references to global variables. This
+// is how the workers will get a reference to the globalized variable. The
+// members of this list will be passed to the outlined parallel function
+// preserving the order.
+// Called by all workers.
+EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
+ *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
+}
+
+// This function is used to init static memory manager. This manager is used to
+// manage statically allocated global memory. This memory is allocated by the
+// compiler and used to correctly implement globalization of the variables in
+// target, teams and distribute regions.
+EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
+ const void *buf, size_t size,
+ int16_t is_shared,
+ const void **frame) {
+ if (is_shared) {
+ *frame = buf;
+ return;
+ }
+ if (isSPMDExecutionMode) {
+ if (GetThreadIdInBlock() == 0) {
+ *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
+ }
+ // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
+ __SYNCTHREADS();
+ return;
+ }
+ ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
+ "Must be called only in the target master thread.");
+ *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
+ __threadfence();
+}
+
+EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
+ int16_t is_shared) {
+ if (is_shared)
+ return;
+ if (isSPMDExecutionMode) {
+ // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
+ __SYNCTHREADS();
+ if (GetThreadIdInBlock() == 0) {
+ omptarget_nvptx_simpleMemoryManager.Release();
+ }
+ return;
+ }
+ __threadfence();
+ ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
+ "Must be called only in the target master thread.");
+ omptarget_nvptx_simpleMemoryManager.Release();
+}
+