Diffstat (limited to 'final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h')
-rw-r--r-- | final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h | 441 |
1 files changed, 441 insertions, 0 deletions
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
new file mode 100644
index 0000000..84c61f9
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -0,0 +1,441 @@
+//===---- omptarget-nvptx.h - NVPTX OpenMP GPU initialization ---- CUDA -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of all library macros, types,
+// and functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __OMPTARGET_NVPTX_H
+#define __OMPTARGET_NVPTX_H
+
+// std includes
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <inttypes.h>
+
+// cuda includes
+#include <cuda.h>
+#include <math.h>
+
+// local includes
+#include "counter_group.h"
+#include "debug.h"     // debug
+#include "interface.h" // interfaces with omp, compiler, and user
+#include "option.h"    // choices we have
+#include "state-queue.h"
+#include "support.h"
+
+#define OMPTARGET_NVPTX_VERSION 1.1
+
+// used by the library for the interface with the app
+#define DISPATCH_FINISHED 0
+#define DISPATCH_NOTFINISHED 1
+
+// used by dynamic scheduling
+#define FINISHED 0
+#define NOT_FINISHED 1
+#define LAST_CHUNK 2
+
+#define BARRIER_COUNTER 0
+#define ORDERED_COUNTER 1
+
+// Macros for Cuda intrinsics
+// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
+// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask().
+#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
+#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
+#define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
+  __shfl_down_sync((mask), (var), (delta), (width))
+#define __BALLOT_SYNC(mask, predicate) __ballot_sync((mask), (predicate))
+#define __ACTIVEMASK() __activemask()
+#else
+#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
+#define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
+  __shfl_down((var), (delta), (width))
+#define __BALLOT_SYNC(mask, predicate) __ballot((predicate))
+#define __ACTIVEMASK() __ballot(1)
+#endif
+
+// arguments needed for L0 parallelism only.
+class omptarget_nvptx_SharedArgs {
+public:
+  // All these methods must be called by the master thread only.
+  INLINE void Init() {
+    args = buffer;
+    nArgs = MAX_SHARED_ARGS;
+  }
+  INLINE void DeInit() {
+    // Free any memory allocated for outlined parallel function with a large
+    // number of arguments.
+    if (nArgs > MAX_SHARED_ARGS) {
+      SafeFree(args, (char *)"new extended args");
+      Init();
+    }
+  }
+  INLINE void EnsureSize(size_t size) {
+    if (size > nArgs) {
+      if (nArgs > MAX_SHARED_ARGS) {
+        SafeFree(args, (char *)"new extended args");
+      }
+      args = (void **) SafeMalloc(size * sizeof(void *),
+                                  (char *)"new extended args");
+      nArgs = size;
+    }
+  }
+  // Called by all threads.
+  INLINE void **GetArgs() { return args; };
+private:
+  // buffer of pre-allocated arguments.
+  void *buffer[MAX_SHARED_ARGS];
+  // pointer to arguments buffer.
+  // starts off as a pointer to 'buffer' but can be dynamically allocated.
+  void **args;
+  // starts off as MAX_SHARED_ARGS but can increase in size.
+  uint32_t nArgs;
+};
+
+extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+
+// Data sharing related quantities; these need to match what is used in the
+// compiler.
+enum DATA_SHARING_SIZES {
+  // The maximum number of workers in a kernel.
+  DS_Max_Worker_Threads = 992,
+  // The size reserved for data in a shared memory slot.
+  DS_Slot_Size = 256,
+  // The slot size that should be reserved for a working warp.
+  DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
+  // The maximum number of warps in use.
+  DS_Max_Warp_Number = 32,
+};
+
+// Data structure to keep in shared memory that traces the current slot, stack,
+// and frame pointer as well as the active threads that didn't exit the current
+// environment.
+struct DataSharingStateTy {
+  __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
+  void *StackPtr[DS_Max_Warp_Number];
+  void *FramePtr[DS_Max_Warp_Number];
+  int32_t ActiveThreads[DS_Max_Warp_Number];
+};
+// Additional worker slot type which is initialized with the default worker
+// slot size of WARPSIZE * DS_Slot_Size (32 * 256) bytes.
+struct __kmpc_data_sharing_worker_slot_static {
+  __kmpc_data_sharing_slot *Next;
+  __kmpc_data_sharing_slot *Prev;
+  void *PrevSlotStackPtr;
+  void *DataEnd;
+  char Data[DS_Worker_Warp_Slot_Size];
+};
+// Additional master slot type which is initialized with the default master
+// slot size of DS_Slot_Size (256) bytes.
+struct __kmpc_data_sharing_master_slot_static {
+  __kmpc_data_sharing_slot *Next;
+  __kmpc_data_sharing_slot *Prev;
+  void *PrevSlotStackPtr;
+  void *DataEnd;
+  char Data[DS_Slot_Size];
+};
+extern __device__ __shared__ DataSharingStateTy DataSharingState;
+
+////////////////////////////////////////////////////////////////////////////////
+// task ICV and (implicit & explicit) task state
+
+class omptarget_nvptx_TaskDescr {
+public:
+  // methods for flags
+  INLINE omp_sched_t GetRuntimeSched();
+  INLINE void SetRuntimeSched(omp_sched_t sched);
+  INLINE int IsDynamic() { return items.flags & TaskDescr_IsDynamic; }
+  INLINE void SetDynamic() {
+    items.flags = items.flags | TaskDescr_IsDynamic;
+  }
+  INLINE void ClearDynamic() {
+    items.flags = items.flags & (~TaskDescr_IsDynamic);
+  }
+  INLINE int InParallelRegion() { return items.flags & TaskDescr_InPar; }
+  INLINE int InL2OrHigherParallelRegion() {
+    return items.flags & TaskDescr_InParL2P;
+  }
+  INLINE int IsParallelConstruct() {
+    return items.flags & TaskDescr_IsParConstr;
+  }
+  INLINE int IsTaskConstruct() { return !IsParallelConstruct(); }
+  // methods for other fields
+  INLINE uint16_t &NThreads() { return items.nthreads; }
+  INLINE uint16_t &ThreadLimit() { return items.threadlimit; }
+  INLINE uint16_t &ThreadId() { return items.threadId; }
+  INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; }
+  INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
+  INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() { return prev; }
+  INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
+    prev = taskDescr;
+  }
+  // init & copy
+  INLINE void InitLevelZeroTaskDescr();
+  INLINE void InitLevelOneTaskDescr(uint16_t tnum,
+                                    omptarget_nvptx_TaskDescr *parentTaskDescr);
+  INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
+  INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
+  INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
+  INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
+  INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr,
+                              uint16_t tnum);
+  INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
+  INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
+                                   uint16_t tid, uint16_t tnum);
+  INLINE void SaveLoopData();
+  INLINE void RestoreLoopData() const;
+
+private:
+  // bits for flags: (7 used, 1 free)
+  // 3 bits (SchedMask) for runtime schedule
+  // 1 bit (IsDynamic) for dynamic schedule (false = static)
+  // 1 bit (InPar) if this thread has encountered one or more parallel regions
+  // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task)
+  // 1 bit (InParL2+) if this thread has encountered an L2 or higher parallel
+  // region
+  static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
+  static const uint8_t TaskDescr_IsDynamic = 0x8;
+  static const uint8_t TaskDescr_InPar = 0x10;
+  static const uint8_t TaskDescr_IsParConstr = 0x20;
+  static const uint8_t TaskDescr_InParL2P = 0x40;
+
+  struct SavedLoopDescr_items {
+    int64_t loopUpperBound;
+    int64_t nextLowerBound;
+    int64_t chunk;
+    int64_t stride;
+    kmp_sched_t schedule;
+  } loopData;
+
+  struct TaskDescr_items {
+    uint8_t flags; // 7 bits used (see flags above)
+    uint8_t unused;
+    uint16_t nthreads;         // thread num for subsequent parallel regions
+    uint16_t threadlimit;      // thread limit ICV
+    uint16_t threadId;         // thread id
+    uint16_t threadsInTeam;    // threads in current team
+    uint64_t runtimeChunkSize; // runtime chunk size
+  } items;
+  omptarget_nvptx_TaskDescr *prev;
+};
+
+// built on kmp
+typedef struct omptarget_nvptx_ExplicitTaskDescr {
+  omptarget_nvptx_TaskDescr
+      taskDescr;              // omptarget_nvptx task description (must be first)
+  kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
+} omptarget_nvptx_ExplicitTaskDescr;
+
+////////////////////////////////////////////////////////////////////////////////
+// Descriptor of a parallel region (worksharing in general)
+
+class omptarget_nvptx_WorkDescr {
+
+public:
+  // access to data
+  INLINE omptarget_nvptx_CounterGroup &CounterGroup() { return cg; }
+  INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }
+  // init
+  INLINE void InitWorkDescr();
+
+private:
+  omptarget_nvptx_CounterGroup cg; // for barrier (no other needed)
+  omptarget_nvptx_TaskDescr masterTaskICV;
+  bool hasCancel;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+class omptarget_nvptx_TeamDescr {
+public:
+  // access to data
+  INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
+    return &levelZeroTaskDescr;
+  }
+  INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
+    return workDescrForActiveParallel;
+  }
+  INLINE omp_lock_t *CriticalLock() { return &criticalLock; }
+  INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }
+
+  // init
+  INLINE void InitTeamDescr();
+
+  INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
+    // If this is invoked by the master thread of the master warp, then
+    // initialize it with a smaller slot.
+    if (IsMasterThread) {
+      // Do not initialize this slot again if it has already been initialized.
+      if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size)
+        return 0;
+      // Initialize the pointer to the end of the slot given the size of the
+      // data section. DataEnd is non-inclusive.
+      master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
+      // We currently do not have a next slot.
+      master_rootS[0].Next = 0;
+      master_rootS[0].Prev = 0;
+      master_rootS[0].PrevSlotStackPtr = 0;
+      return (__kmpc_data_sharing_slot *)&master_rootS[0];
+    }
+    // Do not initialize this slot again if it has already been initialized.
+    if (worker_rootS[wid].DataEnd ==
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size)
+      return 0;
+    // Initialize the pointer to the end of the slot given the size of the data
+    // section. DataEnd is non-inclusive.
+    worker_rootS[wid].DataEnd =
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
+    // We currently do not have a next slot.
+    worker_rootS[wid].Next = 0;
+    worker_rootS[wid].Prev = 0;
+    worker_rootS[wid].PrevSlotStackPtr = 0;
+    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
+  }
+
+  INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
+    worker_rootS[wid].DataEnd =
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
+    // We currently do not have a next slot.
+    worker_rootS[wid].Next = 0;
+    worker_rootS[wid].Prev = 0;
+    worker_rootS[wid].PrevSlotStackPtr = 0;
+    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
+  }
+
+private:
+  omptarget_nvptx_TaskDescr
+      levelZeroTaskDescr; // ICV for team master initial thread
+  omptarget_nvptx_WorkDescr
+      workDescrForActiveParallel; // one, ONLY for the active parallel region
+  omp_lock_t criticalLock;
+  uint64_t lastprivateIterBuffer;
+
+  __align__(16)
+      __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
+  __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// thread private data (struct of arrays for better coalescing)
+// tid refers here to the global thread id
+// multiple concurrent kernels are not supported at this time
+class omptarget_nvptx_ThreadPrivateContext {
+public:
+  // task
+  INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
+    return &levelOneTaskDescr[tid];
+  }
+  INLINE void SetTopLevelTaskDescr(int tid,
+                                   omptarget_nvptx_TaskDescr *taskICV) {
+    topTaskDescr[tid] = taskICV;
+  }
+  INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid);
+  // parallel
+  INLINE uint16_t &NumThreadsForNextParallel(int tid) {
+    return nextRegion.tnum[tid];
+  }
+  // simd
+  INLINE uint16_t &SimdLimitForNextSimd(int tid) {
+    return nextRegion.slim[tid];
+  }
+  // sync
+  INLINE Counter &Priv(int tid) { return priv[tid]; }
+  INLINE void IncrementPriv(int tid, Counter val) { priv[tid] += val; }
+  // schedule (for dispatch)
+  INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
+  INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
+  INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
+  INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
+  INLINE int64_t &Stride(int tid) { return stride[tid]; }
+
+  INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
+
+  INLINE void InitThreadPrivateContext(int tid);
+  INLINE void SetSourceQueue(uint64_t Src) { SourceQueue = Src; }
+  INLINE uint64_t GetSourceQueue() { return SourceQueue; }
+
+private:
+  // team context for this team
+  omptarget_nvptx_TeamDescr teamContext;
+  // task ICV for implicit threads in the only parallel region
+  omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
+  // pointer where to find the current task ICV (top of the stack)
+  omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
+  union {
+    // Only one of the two is live at the same time.
+    // parallel
+    uint16_t tnum[MAX_THREADS_PER_TEAM];
+    // simd limit
+    uint16_t slim[MAX_THREADS_PER_TEAM];
+  } nextRegion;
+  // sync
+  Counter priv[MAX_THREADS_PER_TEAM];
+  // schedule (for dispatch)
+  kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
+  int64_t chunk[MAX_THREADS_PER_TEAM];
+  int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
+  // state for dispatch with dyn/guided OR static (never use both at a time)
+  int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
+  int64_t stride[MAX_THREADS_PER_TEAM];
+  // Queue to which this object must be returned.
+  uint64_t SourceQueue;
+};
+
+/// Device environment data
+struct omptarget_device_environmentTy {
+  int32_t debug_level;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// global device environment
+////////////////////////////////////////////////////////////////////////////////
+
+extern __device__ omptarget_device_environmentTy omptarget_device_environment;
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// global data tables
+////////////////////////////////////////////////////////////////////////////////
+
+extern __device__ __shared__
+    omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
+extern __device__ __shared__ uint32_t execution_param;
+extern __device__ __shared__ void *ReductionScratchpadPtr;
+
+////////////////////////////////////////////////////////////////////////////////
+// work function (outlined parallel/simd functions) and arguments.
+// needed for L1 parallelism only.
+////////////////////////////////////////////////////////////////////////////////
+
+typedef void *omptarget_nvptx_WorkFn;
+extern volatile __device__ __shared__ omptarget_nvptx_WorkFn
+    omptarget_nvptx_workFn;
+
+////////////////////////////////////////////////////////////////////////////////
+// get private data structures
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
+INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
+INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor();
+INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
+
+////////////////////////////////////////////////////////////////////////////////
+// inlined implementation
+////////////////////////////////////////////////////////////////////////////////
+
+#include "counter_groupi.h"
+#include "omptarget-nvptxi.h"
+#include "supporti.h"
+
+#endif
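The __SHFL_SYNC / __SHFL_DOWN_SYNC / __BALLOT_SYNC / __ACTIVEMASK macros near the top of the header exist so the same warp-level code builds against both pre-9.0 and 9.0+ CUDA toolkits: with CUDART_VERSION >= 9000 they forward to the *_sync intrinsics that take an explicit lane mask, otherwise they fall back to the legacy intrinsics and derive the mask with __ballot(1). A minimal sketch of how such wrappers are typically used in a warp-wide sum follows; warpSum is an illustrative helper, not part of this header, and the two macro definitions are copied from it so the snippet is self-contained.

// Sketch only: a warp-wide sum written against the version-gated wrappers.
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
#define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
  __shfl_down_sync((mask), (var), (delta), (width))
#define __ACTIVEMASK() __activemask()
#else
#define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
  __shfl_down((var), (delta), (width))
#define __ACTIVEMASK() __ballot(1)
#endif

__device__ int warpSum(int val) {
  unsigned mask = __ACTIVEMASK(); // converged lanes in the current warp
  for (int offset = 16; offset > 0; offset /= 2)
    val += __SHFL_DOWN_SYNC(mask, val, offset, 32); // pull value from lane + offset
  return val; // lane 0 ends up holding the sum over the warp
}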
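The data-sharing slot sizes follow directly from the DATA_SHARING_SIZES enum, which is why the comments on the two *_slot_static structs refer to DS_Slot_Size and WARPSIZE * DS_Slot_Size. The standalone snippet below restates the relevant constants under renamed identifiers (WARPSIZE is assumed to be 32, as elsewhere in this runtime) and checks the arithmetic at compile time; it is an illustrative check, not part of the header.

// Standalone compile-time check of the slot sizes (assumes WARPSIZE == 32).
#define WARPSIZE_CHECK 32
enum DATA_SHARING_SIZES_CHECK {
  DS_Slot_Size_Check = 256,                                        // master slot data area
  DS_Worker_Warp_Slot_Size_Check = WARPSIZE_CHECK * DS_Slot_Size_Check, // worker slot data area
};
static_assert(DS_Worker_Warp_Slot_Size_Check == 8192,
              "one worker warp's Data area is 32 * 256 = 8 KiB");
// A team descriptor therefore statically reserves roughly
//   32 * 8192 B (worker slots) + 256 B (master slot) ~= 256 KiB
// of data-sharing storage, before the per-slot header fields.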
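As its leading comment notes, omptarget_nvptx_ThreadPrivateContext keeps per-thread state as a struct of arrays rather than an array of per-thread structs, so that neighbouring threads indexing with their tid touch neighbouring addresses and their accesses coalesce. A minimal sketch of the difference, assuming MAX_THREADS_PER_TEAM is 1024; SoAContext, AoSEntry, and touch are illustrative names, not runtime types:

// Sketch contrasting the two layouts (illustrative types, not runtime code).
#include <stdint.h>
#define MAX_THREADS_PER_TEAM 1024 // assumed value for this sketch

struct SoAContext {               // struct-of-arrays, as used by the runtime
  int64_t chunk[MAX_THREADS_PER_TEAM];
  int64_t stride[MAX_THREADS_PER_TEAM];
};
struct AoSEntry {                 // the array-of-structs alternative
  int64_t chunk;
  int64_t stride;
};

__device__ SoAContext soa;
__device__ AoSEntry aos[MAX_THREADS_PER_TEAM];

__global__ void touch() {
  int tid = threadIdx.x;
  soa.chunk[tid] += 1; // neighbouring tids hit addresses 8 bytes apart: coalesced
  aos[tid].chunk += 1; // addresses sizeof(AoSEntry) = 16 bytes apart: strided
}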