//===---- parallel.cu - NVPTX OpenMP parallel implementation ----- CUDA -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//
//    if (master) {
//      sequential code, decide which par loop to do, or if finished
//      __kmpc_kernel_prepare_parallel() // exec by master only
//    }
//    syncthreads // A
//    __kmpc_kernel_parallel() // exec by all
//    if (this thread is included in the parallel) {
//      switch () for all parallel loops
//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
//    }
//
//
//    The reason we don't exec end_parallel for the threads not included
//    in the parallel loop is that for each barrier in the parallel
//    region, these non-included threads will cycle through the
//    syncthread A. Thus they must preserve their current threadId that
//    is larger than thread in team.
//
// To make a long story short...
//
//===----------------------------------------------------------------------===//

#include "omptarget-nvptx.h"

typedef struct ConvergentSimdJob {
  omptarget_nvptx_TaskDescr taskDescr;
  omptarget_nvptx_TaskDescr *convHeadTaskDescr;
  uint16_t slimForNextSimd;
} ConvergentSimdJob;

////////////////////////////////////////////////////////////////////////////////
// support for convergent simd (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
                                          bool *IsFinal, int32_t *LaneSource,
                                          int32_t *LaneId, int32_t *NumLanes) {
  PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
  uint32_t ConvergentMask = Mask;
  int32_t ConvergentSize = __popc(ConvergentMask);
  uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
  *LaneSource += __ffs(WorkRemaining);
  *IsFinal = __popc(WorkRemaining) == 1;
  uint32_t lanemask_lt;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
  *LaneId = __popc(ConvergentMask & lanemask_lt);

  int threadId = GetLogicalThreadIdInBlock();
  int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

  ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
  int32_t SimdLimit =
      omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
  job->slimForNextSimd = SimdLimit;

  int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource);
  // reset simdlimit to avoid propagating to successive #simd
  if (SimdLimitSource > 0 && threadId == sourceThreadId)
    omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;

  // We cannot have more than the # of convergent threads.
  if (SimdLimitSource > 0)
    *NumLanes = min(ConvergentSize, SimdLimitSource);
  else
    *NumLanes = ConvergentSize;
  ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
         *NumLanes);

  // Set to true for lanes participating in the simd region.
  bool isActive = false;
  // Initialize state for active threads.
  if (*LaneId < *NumLanes) {
    omptarget_nvptx_TaskDescr *currTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
    omptarget_nvptx_TaskDescr *sourceTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
            sourceThreadId);
    job->convHeadTaskDescr = currTaskDescr;
    // install top descriptor from the thread for which the lanes are working.
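    // Active lanes adopt the source lane's top-level task descriptor so that
    // all participating lanes see the same ICVs; the lane's previous
    // descriptor was saved in job->convHeadTaskDescr above and is restored by
    // __kmpc_kernel_end_convergent_simd.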
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               sourceTaskDescr);
    isActive = true;
  }

  // requires a memory fence between threads of a warp
  return isActive;
}

EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_simd\n");
  // pop stack
  int threadId = GetLogicalThreadIdInBlock();
  ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
  omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
      job->slimForNextSimd;
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, job->convHeadTaskDescr);
}

typedef struct ConvergentParallelJob {
  omptarget_nvptx_TaskDescr taskDescr;
  omptarget_nvptx_TaskDescr *convHeadTaskDescr;
  uint16_t tnumForNextPar;
} ConvergentParallelJob;

////////////////////////////////////////////////////////////////////////////////
// support for convergent parallelism (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
                                              bool *IsFinal,
                                              int32_t *LaneSource) {
  PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
  uint32_t ConvergentMask = Mask;
  int32_t ConvergentSize = __popc(ConvergentMask);
  uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
  *LaneSource += __ffs(WorkRemaining);
  *IsFinal = __popc(WorkRemaining) == 1;
  uint32_t lanemask_lt;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
  uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);

  int threadId = GetLogicalThreadIdInBlock();
  int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

  ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
  int32_t NumThreadsClause =
      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
  job->tnumForNextPar = NumThreadsClause;

  int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
  // reset numthreads to avoid propagating to successive #parallel
  if (NumThreadsSource > 0 && threadId == sourceThreadId)
    omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
        0;

  // We cannot have more than the # of convergent threads.
  uint16_t NumThreads;
  if (NumThreadsSource > 0)
    NumThreads = min(ConvergentSize, NumThreadsSource);
  else
    NumThreads = ConvergentSize;
  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
         NumThreads);

  // Set to true for workers participating in the parallel region.
  bool isActive = false;
  // Initialize state for active threads.
  if (OmpId < NumThreads) {
    // init L2 task descriptor and storage for the L1 parallel task descriptor.
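    // Unlike the convergent simd case above, an active lane does not borrow
    // the source lane's descriptor directly; it fills its own descriptor in
    // the job buffer via CopyConvergentParent so that it carries its OpenMP
    // thread id (OmpId) and team size (NumThreads).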
    omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr;
    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");

    omptarget_nvptx_TaskDescr *currTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
    omptarget_nvptx_TaskDescr *sourceTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
            sourceThreadId);
    job->convHeadTaskDescr = currTaskDescr;
    newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads);
    // install new top descriptor
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               newTaskDescr);
    isActive = true;
  }

  // requires a memory fence between threads of a warp
  return isActive;
}

EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
  // pop stack
  int threadId = GetLogicalThreadIdInBlock();
  ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, job->convHeadTaskDescr);
  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
      job->tnumForNextPar;
}

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////

// Return the number of CUDA threads that participate in the parallel region.
// The calculation has to consider the simd implementation in nvptx,
// i.e. (num omp threads * num lanes).
//
// cudathreads =
//    if(num_threads != 0) {
//      if(thread_limit > 0) {
//        min (num_threads*numLanes ; thread_limit*numLanes);
//      } else {
//        min (num_threads*numLanes; blockDim.x)
//      }
//    } else {
//      if (thread_limit != 0) {
//        min (thread_limit*numLanes; blockDim.x)
//      } else {
//        // no thread_limit, no num_threads, use all cuda threads
//        blockDim.x;
//      }
//    }
//
// This routine is always called by the team master.
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
                                           int16_t IsOMPRuntimeInitialized) {
  PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
  omptarget_nvptx_workFn = WorkFn;

  if (!IsOMPRuntimeInitialized)
    return;

  // This routine is only called by the team master. The team master is
  // the first thread of the last warp. It always has the logical thread
  // id of 0 (since it is a shadow for the first worker thread).
  int threadId = 0;
  omptarget_nvptx_TaskDescr *currTaskDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
  ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
  ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
          "cannot be called in a parallel region.");
  if (currTaskDescr->InParallelRegion()) {
    PRINT0(LD_PAR, "already in parallel: go seq\n");
    return;
  }

  uint16_t CudaThreadsForParallel = 0;
  uint16_t NumThreadsClause =
      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);

  // we cannot have more than the block size
  uint16_t CudaThreadsAvail = GetNumberOfWorkersInTeam();

  // currTaskDescr->ThreadLimit(): If non-zero, this is the limit as
  // specified by the thread_limit clause on the target directive.
  // GetNumberOfWorkersInTeam(): This is the number of workers available
  // in this kernel instance.
  //
  // E.g.: If thread_limit is 33, the kernel is launched with 33+32=65
  // threads. The last warp is the master warp so in this case
  // GetNumberOfWorkersInTeam() returns 64.
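  // Worked example (with NumLanes == 1 below): for thread_limit(33) the
  // kernel has 64 workers as described above. A num_threads(40) clause then
  // gives CudaThreadsForParallel = min(40, 33) = 33, which the sm_70+ code
  // below further rounds down to 32 so that whole warps participate.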
  // this is different from ThreadAvail of OpenMP because we may be
  // using some of the CUDA threads as SIMD lanes
  int NumLanes = 1;
  if (NumThreadsClause != 0) {
    // reset request to avoid propagating to successive #parallel
    omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
        0;

    // assume that thread_limit*numlanes is already <= CudaThreadsAvail
    // because that is already checked on the host side (CUDA offloading rtl)
    if (currTaskDescr->ThreadLimit() != 0)
      CudaThreadsForParallel =
          NumThreadsClause * NumLanes < currTaskDescr->ThreadLimit() * NumLanes
              ? NumThreadsClause * NumLanes
              : currTaskDescr->ThreadLimit() * NumLanes;
    else {
      CudaThreadsForParallel = (NumThreadsClause * NumLanes > CudaThreadsAvail)
                                   ? CudaThreadsAvail
                                   : NumThreadsClause * NumLanes;
    }
  } else {
    if (currTaskDescr->ThreadLimit() != 0) {
      CudaThreadsForParallel =
          (currTaskDescr->ThreadLimit() * NumLanes > CudaThreadsAvail)
              ? CudaThreadsAvail
              : currTaskDescr->ThreadLimit() * NumLanes;
    } else
      CudaThreadsForParallel = CudaThreadsAvail;
  }

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  // On Volta and newer architectures we require that all lanes in
  // a warp participate in the parallel region. Round down to a
  // multiple of WARPSIZE since it is legal to do so in OpenMP.
  // CudaThreadsAvail is the number of workers available in this
  // kernel instance and is greater than or equal to
  // currTaskDescr->ThreadLimit().
  if (CudaThreadsForParallel < CudaThreadsAvail) {
    CudaThreadsForParallel =
        (CudaThreadsForParallel < WARPSIZE)
            ? 1
            : CudaThreadsForParallel & ~((uint16_t)WARPSIZE - 1);
  }
#endif

  ASSERT(LT_FUSSY, CudaThreadsForParallel > 0,
         "bad thread request of %d threads", CudaThreadsForParallel);
  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
          "only team master can create parallel");

  // set number of threads on work descriptor
  // this is different from the number of cuda threads required for the
  // parallel region
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr,
                                             CudaThreadsForParallel / NumLanes);
  // init counters (copy start to init)
  workDescr.CounterGroup().Reset();
}

// All workers call this function. Deactivate those not needed.
// Fn - the outlined work function to execute.
// returns True if this thread is active, else False.
//
// Only the worker threads call this routine.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
                                   int16_t IsOMPRuntimeInitialized) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");

  // Work function and arguments for L1 parallel region.
  *WorkFn = omptarget_nvptx_workFn;

  if (!IsOMPRuntimeInitialized)
    return true;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Only the worker threads call this routine and the master warp
  // never arrives here. Therefore, use the nvptx thread id.
  int threadId = GetThreadIdInBlock();
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  // Set to true for workers participating in the parallel region.
  bool isActive = false;
  // Initialize state for active threads.
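  // Threads with an id >= ThreadsInTeam() take no part in this parallel
  // region: they return false and go back to waiting at the synchronization
  // point described in the file header (syncthreads A).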
  if (threadId < workDescr.WorkTaskDescr()->ThreadsInTeam()) {
    // init work descriptor from workdescr
    omptarget_nvptx_TaskDescr *newTaskDescr =
        omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
    newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
    // install new top descriptor
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               newTaskDescr);
    // init private from int value
    workDescr.CounterGroup().Init(
        omptarget_nvptx_threadPrivateContext->Priv(threadId));
    PRINT(LD_PAR,
          "thread will execute parallel region with id %d in a team of "
          "%d threads\n",
          newTaskDescr->ThreadId(), newTaskDescr->NThreads());

    isActive = true;
  }

  return isActive;
}

EXTERN void __kmpc_kernel_end_parallel() {
  // pop stack
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
  // Only the worker threads call this routine and the master warp
  // never arrives here. Therefore, use the nvptx thread id.
  int threadId = GetThreadIdInBlock();
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());
}

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes sequential
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");

  // assume this is only called for nested parallel
  int threadId = GetLogicalThreadIdInBlock();

  // unlike actual parallel, threads in the same team do not share
  // the workTaskDescr in this case and num threads is fixed to 1

  // get current task
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->SaveLoopData();

  // allocate new task descriptor and copy value from current one, set prev to
  // it
  omptarget_nvptx_TaskDescr *newTaskDescr =
      (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
                                              (char *)"new seq parallel task");
  newTaskDescr->CopyParent(currTaskDescr);

  // tweak values for serialized parallel case:
  // - each thread becomes ID 0 in its serialized parallel, and
  // - there is only one thread per team
  newTaskDescr->ThreadId() = 0;
  newTaskDescr->ThreadsInTeam() = 1;

  // set new task descriptor as top
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                             newTaskDescr);
}

EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc,
                                           uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");

  // pop stack
  int threadId = GetLogicalThreadIdInBlock();
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  // set new top
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());
  // free
  SafeFree(currTaskDescr, (char *)"new seq parallel task");
  currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->RestoreLoopData();
}

EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_parallel_level\n");

  int threadId = GetLogicalThreadIdInBlock();
  omptarget_nvptx_TaskDescr *currTaskDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
  if (currTaskDescr->InL2OrHigherParallelRegion())
    return 2;
  else if (currTaskDescr->InParallelRegion())
    return 1;
  else
    return 0;
}

// This kmpc call returns the thread id across all teams.
// Its value is cached by the compiler and used when calling the runtime.
// On nvptx it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc) {
  return GetLogicalThreadIdInBlock();
}

////////////////////////////////////////////////////////////////////////////////
// push params
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t tid,
                                    int32_t num_threads) {
  PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
  tid = GetLogicalThreadIdInBlock();
  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
      num_threads;
}

EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t tid,
                                   int32_t simd_limit) {
  PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", simd_limit);
  tid = GetLogicalThreadIdInBlock();
  omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
}

// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.
EXTERN void __kmpc_push_num_teams(kmp_Indent *loc, int32_t tid,
                                  int32_t num_teams, int32_t thread_limit) {
  PRINT(LD_IO, "call kmpc_push_num_teams %d\n", num_teams);
  ASSERT0(LT_FUSSY, FALSE,
          "should never have anything with new teams on device");
}

EXTERN void __kmpc_push_proc_bind(kmp_Indent *loc, uint32_t tid,
                                  int proc_bind) {
  PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", proc_bind);
}
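
// Illustrative sketch only (not part of the runtime): a compiler-generated
// worker loop is expected to drive the entry points above roughly as in the
// pattern from the file header. The names workerLoop and WorkFn below are
// hypothetical.
//
//   void workerLoop() {
//     for (;;) {
//       __syncthreads();              // "syncthreads A" in the file header
//       void *WorkFn;
//       bool active = __kmpc_kernel_parallel(&WorkFn,
//                                            /*IsOMPRuntimeInitialized=*/1);
//       if (!WorkFn)
//         return;                     // termination signal from the master
//       if (active) {
//         ((void (*)())WorkFn)();     // outlined parallel region
//         __kmpc_kernel_end_parallel();
//       }
//     }
//   }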