Diffstat (limited to 'final/libomptarget/deviceRTLs/nvptx/src/parallel.cu')
-rw-r--r-- | final/libomptarget/deviceRTLs/nvptx/src/parallel.cu | 479
1 file changed, 479 insertions, 0 deletions
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/final/libomptarget/deviceRTLs/nvptx/src/parallel.cu
new file mode 100644
index 0000000..33509b6
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -0,0 +1,479 @@
+//===---- parallel.cu - NVPTX OpenMP parallel implementation ----- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Parallel implementation on the GPU. Here is the pattern:
+//
+// while (not finished) {
+//
+//   if (master) {
+//     sequential code, decide which par loop to do, or if finished
+//     __kmpc_kernel_prepare_parallel() // exec by master only
+//   }
+//   syncthreads // A
+//   __kmpc_kernel_parallel() // exec by all
+//   if (this thread is included in the parallel) {
+//     switch () for all parallel loops
+//     __kmpc_kernel_end_parallel() // exec only by threads in parallel
+//   }
+// }
+//
+// The reason we don't exec end_parallel for the threads not included
+// in the parallel loop is that for each barrier in the parallel
+// region, these non-included threads will cycle through the
+// syncthread A. Thus they must preserve their current threadId, which
+// is larger than the number of threads in the team.
+//
+// To make a long story short...
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+typedef struct ConvergentSimdJob {
+  omptarget_nvptx_TaskDescr taskDescr;
+  omptarget_nvptx_TaskDescr *convHeadTaskDescr;
+  uint16_t slimForNextSimd;
+} ConvergentSimdJob;
+
+////////////////////////////////////////////////////////////////////////////////
+// support for convergent simd (team of threads in a warp only)
+////////////////////////////////////////////////////////////////////////////////
+EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
+                                          bool *IsFinal, int32_t *LaneSource,
+                                          int32_t *LaneId, int32_t *NumLanes) {
+  PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
+  uint32_t ConvergentMask = Mask;
+  int32_t ConvergentSize = __popc(ConvergentMask);
+  uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
+  *LaneSource += __ffs(WorkRemaining);
+  *IsFinal = __popc(WorkRemaining) == 1;
+  uint32_t lanemask_lt;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
+  *LaneId = __popc(ConvergentMask & lanemask_lt);
+
+  int threadId = GetLogicalThreadIdInBlock();
+  int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
+
+  ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
+  int32_t SimdLimit =
+      omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
+  job->slimForNextSimd = SimdLimit;
+
+  int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource);
+  // reset simdlimit to avoid propagating to successive #simd
+  if (SimdLimitSource > 0 && threadId == sourceThreadId)
+    omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;
+
+  // We cannot have more than the # of convergent threads.
+  if (SimdLimitSource > 0)
+    *NumLanes = min(ConvergentSize, SimdLimitSource);
+  else
+    *NumLanes = ConvergentSize;
+  ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
+         *NumLanes);
+
+  // Set to true for lanes participating in the simd region.
+  bool isActive = false;
+  // Initialize state for active threads.
+  if (*LaneId < *NumLanes) {
+    omptarget_nvptx_TaskDescr *currTaskDescr =
+        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+    omptarget_nvptx_TaskDescr *sourceTaskDescr =
+        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
+            sourceThreadId);
+    job->convHeadTaskDescr = currTaskDescr;
+    // install top descriptor from the thread for which the lanes are working.
+    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
+                                                               sourceTaskDescr);
+    isActive = true;
+  }
+
+  // requires a memory fence between threads of a warp
+  return isActive;
+}
+
+EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
+  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_simd\n");
+  // pop stack
+  int threadId = GetLogicalThreadIdInBlock();
+  ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
+  omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
+      job->slimForNextSimd;
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
+      threadId, job->convHeadTaskDescr);
+}
+
+typedef struct ConvergentParallelJob {
+  omptarget_nvptx_TaskDescr taskDescr;
+  omptarget_nvptx_TaskDescr *convHeadTaskDescr;
+  uint16_t tnumForNextPar;
+} ConvergentParallelJob;
+
+////////////////////////////////////////////////////////////////////////////////
+// support for convergent parallelism (team of threads in a warp only)
+////////////////////////////////////////////////////////////////////////////////
+EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
+                                              bool *IsFinal,
+                                              int32_t *LaneSource) {
+  PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
+  uint32_t ConvergentMask = Mask;
+  int32_t ConvergentSize = __popc(ConvergentMask);
+  uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
+  *LaneSource += __ffs(WorkRemaining);
+  *IsFinal = __popc(WorkRemaining) == 1;
+  uint32_t lanemask_lt;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
+  uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);
+
+  int threadId = GetLogicalThreadIdInBlock();
+  int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
+
+  ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
+  int32_t NumThreadsClause =
+      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(
+          threadId);
+  job->tnumForNextPar = NumThreadsClause;
+
+  int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
+  // reset numthreads to avoid propagating to successive #parallel
+  if (NumThreadsSource > 0 && threadId == sourceThreadId)
+    omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
+        0;
+
+  // We cannot have more than the # of convergent threads.
+  uint16_t NumThreads;
+  if (NumThreadsSource > 0)
+    NumThreads = min(ConvergentSize, NumThreadsSource);
+  else
+    NumThreads = ConvergentSize;
+  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
+         NumThreads);
+
+  // Set to true for workers participating in the parallel region.
+  bool isActive = false;
+  // Initialize state for active threads.
+  if (OmpId < NumThreads) {
+    // init L2 task descriptor and storage for the L1 parallel task descriptor.
+    omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr;
+    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
+    omptarget_nvptx_TaskDescr *currTaskDescr =
+        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+    omptarget_nvptx_TaskDescr *sourceTaskDescr =
+        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
+            sourceThreadId);
+    job->convHeadTaskDescr = currTaskDescr;
+    newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads);
+    // install new top descriptor
+    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
+                                                               newTaskDescr);
+    isActive = true;
+  }
+
+  // requires a memory fence between threads of a warp
+  return isActive;
+}
+
+EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
+  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
+  // pop stack
+  int threadId = GetLogicalThreadIdInBlock();
+  ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
+      threadId, job->convHeadTaskDescr);
+  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
+      job->tnumForNextPar;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// support for parallel that goes parallel (1 static level only)
+////////////////////////////////////////////////////////////////////////////////
+
+// return the number of cuda threads that participate in the parallel region;
+// the calculation has to consider the simd implementation in nvptx,
+// i.e. (num omp threads * num lanes):
+//
+// cudathreads =
+//   if (num_threads != 0) {
+//     if (thread_limit > 0) {
+//       min(num_threads * numLanes, thread_limit * numLanes)
+//     } else {
+//       min(num_threads * numLanes, blockDim.x)
+//     }
+//   } else {
+//     if (thread_limit != 0) {
+//       min(thread_limit * numLanes, blockDim.x)
+//     } else { // no thread_limit, no num_threads, use all cuda threads
+//       blockDim.x
+//     }
+//   }
+//
+// This routine is always called by the team master.
+EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
+                                           int16_t IsOMPRuntimeInitialized) {
+  PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
+  omptarget_nvptx_workFn = WorkFn;
+
+  if (!IsOMPRuntimeInitialized)
+    return;
+
+  // This routine is only called by the team master. The team master is
+  // the first thread of the last warp. It always has the logical thread
+  // id of 0 (since it is a shadow for the first worker thread).
+  int threadId = 0;
+  omptarget_nvptx_TaskDescr *currTaskDescr =
+      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+  ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
+  ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
+          "cannot be called in a parallel region.");
+  if (currTaskDescr->InParallelRegion()) {
+    PRINT0(LD_PAR, "already in parallel: go seq\n");
+    return;
+  }
+
+  uint16_t CudaThreadsForParallel = 0;
+  uint16_t NumThreadsClause =
+      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(
+          threadId);
+
+  // we cannot have more than the block size
+  uint16_t CudaThreadsAvail = GetNumberOfWorkersInTeam();
+
+  // currTaskDescr->ThreadLimit(): If non-zero, this is the limit as
+  // specified by the thread_limit clause on the target directive.
+  // GetNumberOfWorkersInTeam(): This is the number of workers available
+  // in this kernel instance.
+  //
+  // E.g.: If thread_limit is 33, the kernel is launched with 33+32=65
+  // threads. The last warp is the master warp so in this case
+  // GetNumberOfWorkersInTeam() returns 64.
+
+  // this is different from ThreadAvail of OpenMP because we may be
+  // using some of the CUDA threads as SIMD lanes
+  int NumLanes = 1;
+  if (NumThreadsClause != 0) {
+    // reset request to avoid propagating to successive #parallel
+    omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
+        0;
+
+    // assume that thread_limit*numlanes is already <= CudaThreadsAvail
+    // because that is already checked on the host side (CUDA offloading rtl)
+    if (currTaskDescr->ThreadLimit() != 0)
+      CudaThreadsForParallel =
+          NumThreadsClause * NumLanes < currTaskDescr->ThreadLimit() * NumLanes
+              ? NumThreadsClause * NumLanes
+              : currTaskDescr->ThreadLimit() * NumLanes;
+    else {
+      CudaThreadsForParallel = (NumThreadsClause * NumLanes > CudaThreadsAvail)
+                                   ? CudaThreadsAvail
+                                   : NumThreadsClause * NumLanes;
+    }
+  } else {
+    if (currTaskDescr->ThreadLimit() != 0) {
+      CudaThreadsForParallel =
+          (currTaskDescr->ThreadLimit() * NumLanes > CudaThreadsAvail)
+              ? CudaThreadsAvail
+              : currTaskDescr->ThreadLimit() * NumLanes;
+    } else
+      CudaThreadsForParallel = CudaThreadsAvail;
+  }
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  // On Volta and newer architectures we require that all lanes in
+  // a warp participate in the parallel region. Round down to a
+  // multiple of WARPSIZE since it is legal to do so in OpenMP.
+  // CudaThreadsAvail is the number of workers available in this
+  // kernel instance and is greater than or equal to
+  // currTaskDescr->ThreadLimit().
+  if (CudaThreadsForParallel < CudaThreadsAvail) {
+    CudaThreadsForParallel =
+        (CudaThreadsForParallel < WARPSIZE)
+            ? 1
+            : CudaThreadsForParallel & ~((uint16_t)WARPSIZE - 1);
+  }
+#endif
+
+  ASSERT(LT_FUSSY, CudaThreadsForParallel > 0,
+         "bad thread request of %d threads", CudaThreadsForParallel);
+  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
+          "only team master can create parallel");
+
+  // set number of threads on work descriptor; this is different from the
+  // number of cuda threads required for the parallel region
+  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
+  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr,
+                                             CudaThreadsForParallel / NumLanes);
+  // init counters (copy start to init)
+  workDescr.CounterGroup().Reset();
+}
+
+// All workers call this function. Deactivate those not needed.
+// Fn - the outlined work function to execute.
+// Returns true if this thread is active, else false.
+//
+// Only the worker threads call this routine.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
+                                   int16_t IsOMPRuntimeInitialized) {
+  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");
+
+  // Work function and arguments for L1 parallel region.
+  *WorkFn = omptarget_nvptx_workFn;
+
+  if (!IsOMPRuntimeInitialized)
+    return true;
+
+  // If this is the termination signal from the master, quit early.
+  if (!*WorkFn)
+    return false;
+
+  // Only the worker threads call this routine and the master warp
+  // never arrives here. Therefore, use the nvptx thread id.
+  int threadId = GetThreadIdInBlock();
+  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
+  // Set to true for workers participating in the parallel region.
+  bool isActive = false;
+  // Initialize state for active threads.
+  if (threadId < workDescr.WorkTaskDescr()->ThreadsInTeam()) {
+    // init task descriptor from the work descriptor
+    omptarget_nvptx_TaskDescr *newTaskDescr =
+        omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
+    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
+    newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
+    // install new top descriptor
+    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
+                                                               newTaskDescr);
+    // init private counters from the init value
+    workDescr.CounterGroup().Init(
+        omptarget_nvptx_threadPrivateContext->Priv(threadId));
+    PRINT(LD_PAR,
+          "thread will execute parallel region with id %d in a team of "
+          "%d threads\n",
+          newTaskDescr->ThreadId(), newTaskDescr->NThreads());
+
+    isActive = true;
+  }
+
+  return isActive;
+}
+
+EXTERN void __kmpc_kernel_end_parallel() {
+  // pop stack
+  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
+  // Only the worker threads call this routine and the master warp
+  // never arrives here. Therefore, use the nvptx thread id.
+  int threadId = GetThreadIdInBlock();
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
+      threadId, currTaskDescr->GetPrevTaskDescr());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// support for parallel that goes sequential
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) {
+  PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");
+
+  // assume this is only called for nested parallel
+  int threadId = GetLogicalThreadIdInBlock();
+
+  // unlike an actual parallel region, threads in the same team do not share
+  // the workTaskDescr in this case, and the number of threads is fixed to 1
+
+  // get current task
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
+  currTaskDescr->SaveLoopData();
+
+  // allocate new task descriptor and copy value from current one, set prev to
+  // it
+  omptarget_nvptx_TaskDescr *newTaskDescr =
+      (omptarget_nvptx_TaskDescr *)SafeMalloc(
+          sizeof(omptarget_nvptx_TaskDescr), (char *)"new seq parallel task");
+  newTaskDescr->CopyParent(currTaskDescr);
+
+  // tweak values for serialized parallel case:
+  // - each thread becomes ID 0 in its serialized parallel, and
+  // - there is only one thread per team
+  newTaskDescr->ThreadId() = 0;
+  newTaskDescr->ThreadsInTeam() = 1;
+
+  // set new task descriptor as top
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
+                                                             newTaskDescr);
+}
+
+EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc,
+                                           uint32_t global_tid) {
+  PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");
+
+  // pop stack
+  int threadId = GetLogicalThreadIdInBlock();
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
+  // set new top
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
+      threadId, currTaskDescr->GetPrevTaskDescr());
+  // free
+  SafeFree(currTaskDescr, (char *)"new seq parallel task");
+  currTaskDescr = getMyTopTaskDescriptor(threadId);
+  currTaskDescr->RestoreLoopData();
+}
+
+EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) {
+  PRINT0(LD_IO, "call to __kmpc_parallel_level\n");
+
+  int threadId = GetLogicalThreadIdInBlock();
+  omptarget_nvptx_TaskDescr *currTaskDescr =
+      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+  if (currTaskDescr->InL2OrHigherParallelRegion())
+    return 2;
+  else if (currTaskDescr->InParallelRegion())
+    return 1;
+  else
+    return 0;
+}
+
+// This kmpc call returns the thread id across all teams. Its value is
+// cached by the compiler and used when calling the runtime. On nvptx
+// it's cheap to recalculate this value so we never use the result
+// of this call.
+EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc) {
+  return GetLogicalThreadIdInBlock();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// push params
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t tid,
+                                    int32_t num_threads) {
+  PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
+  tid = GetLogicalThreadIdInBlock();
+  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
+      num_threads;
+}
+
+EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t tid,
+                                   int32_t simd_limit) {
+  PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", simd_limit);
+  tid = GetLogicalThreadIdInBlock();
+  omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
+}
+
+// Do nothing. The host guarantees we started the requested number of
+// teams and we only need inspection of gridDim.
+EXTERN void __kmpc_push_num_teams(kmp_Indent *loc, int32_t tid,
+                                  int32_t num_teams, int32_t thread_limit) {
+  PRINT(LD_IO, "call kmpc_push_num_teams %d\n", num_teams);
+  ASSERT0(LT_FUSSY, FALSE,
+          "should never have anything with new teams on device");
+}
+
+EXTERN void __kmpc_push_proc_bind(kmp_Indent *loc, uint32_t tid,
+                                  int proc_bind) {
+  PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", proc_bind);
+}
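For orientation, below is a minimal sketch (not part of the patch) of the master/worker pattern described in the file header, using the entry points added here plus GetThreadIdInBlock() and GetMasterThreadID() from omptarget-nvptx.h. The top-level loop and the outlined work functions (parallelRegion0, parallelRegion1) are hypothetical stand-ins for compiler-generated code, and the termination handshake is simplified:

    #include "omptarget-nvptx.h"

    // Hypothetical outlined parallel regions; in the real flow these are
    // generated by the OpenMP compiler rather than written by hand.
    __device__ void parallelRegion0();
    __device__ void parallelRegion1();

    __device__ void kernelMainLoopSketch() {
      for (;;) {
        if (GetThreadIdInBlock() == GetMasterThreadID()) {
          // Sequential part: the master picks the next parallel region
          // (or publishes a null work function to signal termination).
          __kmpc_kernel_prepare_parallel((void *)parallelRegion0,
                                         /*IsOMPRuntimeInitialized=*/1);
        }
        __syncthreads(); // barrier "A" from the header comment

        void *workFn;
        if (__kmpc_kernel_parallel(&workFn, /*IsOMPRuntimeInitialized=*/1)) {
          // Threads included in the parallel region dispatch to the
          // outlined function ("switch () for all parallel loops").
          if (workFn == (void *)parallelRegion0)
            parallelRegion0();
          else if (workFn == (void *)parallelRegion1)
            parallelRegion1();
          __kmpc_kernel_end_parallel(); // only included threads run this
        }
        if (!workFn) // null work function: master signalled "finished"
          break;
        __syncthreads();
      }
    }

Threads not selected for the parallel region skip __kmpc_kernel_end_parallel(), matching the rationale in the header comment: they keep their original threadId (larger than the team size) while cycling through barrier A.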