diff options
author | Alexey Bataev <a.bataev@hotmail.com> | 2018-06-25 13:43:35 +0000 |
---|---|---|
committer | Alexey Bataev <a.bataev@hotmail.com> | 2018-06-25 13:43:35 +0000 |
commit | 0cf8dceda6606d9a3f19f1049a9698682680ce09 (patch) | |
tree | e178e7b85ef21df449655a4008ca33e2914531a7 | |
parent | 1fdab7f48cf43e77c09dc34aa7b79152ded9a6f8 (diff) |
[OPENMP, NVPTX] Fixes for NVPTX RTL
Summary:
This patch fixes several problems in the implementation of the NVPTX RTL.
1. Fixes detection of the last iteration for loops with static scheduling without chunks.
2. Fixes reductions for the serialized parallel constructs.
3. Fixes handling of the barriers.
Reviewers: grokos
Reviewed By: grokos
Subscribers: Hahnfeld, guansong, openmp-commits
Differential Revision: https://reviews.llvm.org/D48480
git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@335469 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/loop.cu | 2 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/reduction.cu | 12 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/sync.cu | 54 |
3 files changed, 36 insertions, 32 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu index 80fb61f..0e808df 100644 --- a/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -86,7 +86,7 @@ public: T inputUb = ub; ub = lb + chunk - 1; // Clang uses i <= ub - last = ub == inputUb; + last = lb <= inputUb && inputUb <= ub; stride = loopSize; // make sure we only do 1 chunk per warp } diff --git a/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/libomptarget/deviceRTLs/nvptx/src/reduction.cu index afa8e81..aedb635 100644 --- a/libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ b/libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -161,6 +161,11 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, kmp_InterWarpCopyFctPtr cpyFct, bool isSPMDExecutionMode, bool isRuntimeUninitialized = false) { + uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); + uint32_t NumThreads = GetNumberOfOmpThreads( + BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); + if (NumThreads == 1) + return 1; /* * This reduce function handles reduction within a team. It handles * parallel regions in both L1 and L2 parallelism levels. It also @@ -173,9 +178,6 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, */ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); - uint32_t NumThreads = GetNumberOfOmpThreads( - BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; uint32_t WarpId = BlockThreadId / WARPSIZE; @@ -219,10 +221,6 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, // early. 
return gpu_irregular_simd_reduce(reduce_data, shflFct); - uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); - uint32_t NumThreads = GetNumberOfOmpThreads( - BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); - // When we have more than [warpsize] number of threads // a block reduction is performed here. // diff --git a/libomptarget/deviceRTLs/nvptx/src/sync.cu b/libomptarget/deviceRTLs/nvptx/src/sync.cu index a577d7a..68f08a1 100644 --- a/libomptarget/deviceRTLs/nvptx/src/sync.cu +++ b/libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -35,40 +35,46 @@ EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t tid) { EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) { PRINT0(LD_IO, "call kmpc_cancel_barrier\n"); - __syncthreads(); + __kmpc_barrier(loc_ref, tid); PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n"); return 0; } EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) { - tid = GetLogicalThreadIdInBlock(); - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); - if (!currTaskDescr->InL2OrHigherParallelRegion()) { - int numberOfActiveOMPThreads = - GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); + if (isSPMDMode()) { + __kmpc_barrier_simple_spmd(loc_ref, tid); + } else if (isRuntimeUninitialized()) { + __kmpc_barrier_simple_generic(loc_ref, tid); + } else { + tid = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); + if (!currTaskDescr->InL2OrHigherParallelRegion()) { + int numberOfActiveOMPThreads = + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - // On Volta and newer architectures we require that all lanes in - // a warp (at least, all present for the kernel launch) participate in the - // barrier. This is enforced when launching the parallel region. 
An - // exception is when there are < WARPSIZE workers. In this case only 1 - // worker is started, so we don't need a barrier. - if (numberOfActiveOMPThreads > 1) { + // On Volta and newer architectures we require that all lanes in + // a warp (at least, all present for the kernel launch) participate in the + // barrier. This is enforced when launching the parallel region. An + // exception is when there are < WARPSIZE workers. In this case only 1 + // worker is started, so we don't need a barrier. + if (numberOfActiveOMPThreads > 1) { #endif - // The #threads parameter must be rounded up to the WARPSIZE. - int threads = - WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); - - PRINT(LD_SYNC, - "call kmpc_barrier with %d omp threads, sync parameter %d\n", - numberOfActiveOMPThreads, threads); - // Barrier #1 is for synchronization among active threads. - named_sync(L1_BARRIER, threads); + // The #threads parameter must be rounded up to the WARPSIZE. + int threads = + WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); + + PRINT(LD_SYNC, + "call kmpc_barrier with %d omp threads, sync parameter %d\n", + numberOfActiveOMPThreads, threads); + // Barrier #1 is for synchronization among active threads. + named_sync(L1_BARRIER, threads); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - } // numberOfActiveOMPThreads > 1 + } // numberOfActiveOMPThreads > 1 #endif + } + PRINT0(LD_SYNC, "completed kmpc_barrier\n"); } - PRINT0(LD_SYNC, "completed kmpc_barrier\n"); } // Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0 |