diff options
Diffstat (limited to 'final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h')
-rw-r--r-- | final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h | 218 |
1 files changed, 218 insertions, 0 deletions
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h new file mode 100644 index 0000000..086f4c5 --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h @@ -0,0 +1,218 @@ +//===---- omptarget-nvptxi.h - NVPTX OpenMP GPU initialization --- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of all library macros, types, +// and functions. +// +//===----------------------------------------------------------------------===// + +//////////////////////////////////////////////////////////////////////////////// +// Task Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() { + // sched starts from 1..4; encode it as 0..3; so add 1 here + uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1; + return (omp_sched_t)rc; +} + +INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) { + // sched starts from 1..4; encode it as 0..3; so sub 1 here + uint8_t val = ((uint8_t)sched) - 1; + // clear current sched + items.flags &= ~TaskDescr_SchedMask; + // set new sched + items.flags |= val; +} + +INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() { + // slow method + // flag: + // default sched is static, + // dyn is off (unused now anyway, but may need to sample from host ?) + // not in parallel + + items.flags = 0; + items.nthreads = GetNumberOfProcsInTeam(); + ; // threads: whatever was alloc by kernel + items.threadId = 0; // is master + items.threadsInTeam = 1; // sequential + items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1 +} + +// This is called when all threads are started together in SPMD mode. +// OMP directives include target parallel, target distribute parallel for, etc. +INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr( + uint16_t tnum, omptarget_nvptx_TaskDescr *parentTaskDescr) { + // slow method + // flag: + // default sched is static, + // dyn is off (unused now anyway, but may need to sample from host ?) + // in L1 parallel + + items.flags = + TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel + items.nthreads = 0; // # threads for subsequent parallel region + items.threadId = + GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) + items.threadsInTeam = tnum; + items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1 + prev = parentTaskDescr; +} + +INLINE void omptarget_nvptx_TaskDescr::CopyData( + omptarget_nvptx_TaskDescr *sourceTaskDescr) { + items = sourceTaskDescr->items; +} + +INLINE void +omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) { + CopyData(sourceTaskDescr); + prev = sourceTaskDescr->prev; +} + +INLINE void omptarget_nvptx_TaskDescr::CopyParent( + omptarget_nvptx_TaskDescr *parentTaskDescr) { + CopyData(parentTaskDescr); + prev = parentTaskDescr; +} + +INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask( + omptarget_nvptx_TaskDescr *parentTaskDescr) { + CopyParent(parentTaskDescr); + items.flags = items.flags & ~TaskDescr_IsParConstr; + ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task"); +} + +INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr( + omptarget_nvptx_TaskDescr *masterTaskDescr, uint16_t tnum) { + CopyParent(masterTaskDescr); + // overrwrite specific items; + items.flags |= + TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel + items.threadsInTeam = tnum; // set number of threads +} + +INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr( + omptarget_nvptx_TaskDescr *workTaskDescr) { + Copy(workTaskDescr); + // + // overrwrite specific items; + // + // The threadID should be GetThreadIdInBlock() % GetMasterThreadID(). + // This is so that the serial master (first lane in the master warp) + // gets a threadId of 0. + // However, we know that this function is always called in a parallel + // region where only workers are active. The serial master thread + // never enters this region. When a parallel region is executed serially, + // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions + // are called, which never activate this region. + items.threadId = + GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) +} + +INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent( + omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) { + CopyParent(parentTaskDescr); + items.flags |= TaskDescr_InParL2P; // In L2+ parallelism + items.threadsInTeam = tnum; // set number of threads + items.threadId = tid; +} + +INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() { + loopData.loopUpperBound = + omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId); + loopData.nextLowerBound = + omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId); + loopData.schedule = + omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId); + loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId); + loopData.stride = + omptarget_nvptx_threadPrivateContext->Stride(items.threadId); +} + +INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const { + omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk; + omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) = + loopData.loopUpperBound; + omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) = + loopData.nextLowerBound; + omptarget_nvptx_threadPrivateContext->Stride(items.threadId) = + loopData.stride; + omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) = + loopData.schedule; +} + +//////////////////////////////////////////////////////////////////////////////// +// Thread Private Context +//////////////////////////////////////////////////////////////////////////////// + +INLINE omptarget_nvptx_TaskDescr * +omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) { + ASSERT0( + LT_FUSSY, tid < MAX_THREADS_PER_TEAM, + "Getting top level, tid is larger than allocated data structure size"); + return topTaskDescr[tid]; +} + +INLINE void +omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) { + // levelOneTaskDescr is init when starting the parallel region + // top task descr is NULL (team master version will be fixed separately) + topTaskDescr[tid] = NULL; + // no num threads value has been pushed + nextRegion.tnum[tid] = 0; + // priv counter init to zero + priv[tid] = 0; + // the following don't need to be init here; they are init when using dyn + // sched + // current_Event, events_Number, chunk, num_Iterations, schedule +} + +//////////////////////////////////////////////////////////////////////////////// +// Work Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE void omptarget_nvptx_WorkDescr::InitWorkDescr() { + cg.Clear(); // start and stop to zero too + // threadsInParallelTeam does not need to be init (done in start parallel) + hasCancel = FALSE; +} + +//////////////////////////////////////////////////////////////////////////////// +// Team Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() { + levelZeroTaskDescr.InitLevelZeroTaskDescr(); + workDescrForActiveParallel.InitWorkDescr(); + // omp_init_lock(criticalLock); +} + +//////////////////////////////////////////////////////////////////////////////// +// Get private data structure for thread +//////////////////////////////////////////////////////////////////////////////// + +// Utility routines for CUDA threads +INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() { + return omptarget_nvptx_threadPrivateContext->TeamContext(); +} + +INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() { + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + return currTeamDescr.WorkDescr(); +} + +INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) { + return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); +} + +INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() { + return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock()); +} |