aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@hotmail.com>2018-07-23 13:52:12 +0000
committerAlexey Bataev <a.bataev@hotmail.com>2018-07-23 13:52:12 +0000
commit7c9019ff7177253995729bb61a16e6ce65567966 (patch)
tree40220f64de964ed02f7ebf510e04e56b8d08f913
parentd7d72b740038e4285e1d51d6feb48330c0fb6f5a (diff)
[OPNEMP, NVPTX] Fixed sychronization construct + code cleanup.
Summary: 1. Fixed internal problem in `__kmpc_barrier` function: SPMD mode synchronization function should be called only in L1 parallel level. 2. Removed some extra code for synchronization inside of the code, used `__kmpc_barrier` instead. 3. Some code cleanup. Reviewers: gtbercea, grokos Subscribers: openmp-commits Differential Revision: https://reviews.llvm.org/D49564 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@337691 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/loop.cu18
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/reduction.cu26
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/supporti.h3
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/sync.cu32
4 files changed, 25 insertions, 54 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu
index 60818af..f3e475d 100644
--- a/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -240,12 +240,8 @@ public:
// Process schedule.
if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
- if (OrderedSchedule(schedule)) {
- if (isSPMDMode())
- __syncthreads();
- else
- __kmpc_barrier(loc, threadId);
- }
+ if (OrderedSchedule(schedule))
+ __kmpc_barrier(loc, threadId);
PRINT(LD_LOOP,
"go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
(long)tnum, P64(tripCount), schedule);
@@ -338,10 +334,7 @@ public:
omptarget_nvptx_threadPrivateContext->Stride(tid));
} else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
- if (isSPMDMode())
- __syncthreads();
- else
- __kmpc_barrier(loc, threadId);
+ __kmpc_barrier(loc, threadId);
// save sched state
int teamId = GetOmpTeamId();
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
@@ -352,10 +345,7 @@ public:
omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
}
- if (isSPMDMode())
- __syncthreads();
- else
- __kmpc_barrier(loc, threadId);
+ __kmpc_barrier(loc, threadId);
PRINT(LD_LOOP,
"dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
", chunk %" PRIu64 "\n",
diff --git a/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/libomptarget/deviceRTLs/nvptx/src/reduction.cu
index aedb635..b813a11 100644
--- a/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -25,9 +25,8 @@ int32_t __gpu_block_reduce() {
if (nt != blockDim.x)
return 0;
unsigned tnum = __ACTIVEMASK();
- if (tnum != (~0x0)) { // assume swapSize is 32
+ if (tnum != (~0x0)) // assume swapSize is 32
return 0;
- }
return 1;
}
@@ -48,32 +47,21 @@ int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars,
if (numthread == 1)
return 1;
- else if (!__gpu_block_reduce())
+ if (!__gpu_block_reduce())
return 2;
- else {
- if (threadIdx.x == 0)
- return 1;
- else
- return 0;
- }
+ if (threadIdx.x == 0)
+ return 1;
+ return 0;
}
EXTERN
int32_t __kmpc_reduce_combined(kmp_Indent *loc) {
- if (threadIdx.x == 0) {
- return 2;
- } else {
- return 0;
- }
+ return threadIdx.x == 0 ? 2 : 0;
}
EXTERN
int32_t __kmpc_reduce_simd(kmp_Indent *loc) {
- if (threadIdx.x % 32 == 0) {
- return 1;
- } else {
- return 0;
- }
+ return (threadIdx.x % 32 == 0) ? 1 : 0;
}
EXTERN
diff --git a/libomptarget/deviceRTLs/nvptx/src/supporti.h b/libomptarget/deviceRTLs/nvptx/src/supporti.h
index 2b267c3..4de2039 100644
--- a/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -155,8 +155,7 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
INLINE int GetNumberOfProcsInDevice() {
if (isGenericMode())
return GetNumberOfWorkersInTeam();
- else
- return GetNumberOfThreadsInBlock();
+ return GetNumberOfThreadsInBlock();
}
INLINE int GetNumberOfProcsInTeam() { return GetNumberOfProcsInDevice(); }
diff --git a/libomptarget/deviceRTLs/nvptx/src/sync.cu b/libomptarget/deviceRTLs/nvptx/src/sync.cu
index 68f08a1..7e55df8 100644
--- a/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -41,25 +41,21 @@ EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) {
}
EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
- if (isSPMDMode()) {
- __kmpc_barrier_simple_spmd(loc_ref, tid);
- } else if (isRuntimeUninitialized()) {
- __kmpc_barrier_simple_generic(loc_ref, tid);
+ if (isRuntimeUninitialized()) {
+ if (isSPMDMode())
+ __kmpc_barrier_simple_spmd(loc_ref, tid);
+ else
+ __kmpc_barrier_simple_generic(loc_ref, tid);
} else {
tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
- if (!currTaskDescr->InL2OrHigherParallelRegion()) {
- int numberOfActiveOMPThreads =
- GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- // On Volta and newer architectures we require that all lanes in
- // a warp (at least, all present for the kernel launch) participate in the
- // barrier. This is enforced when launching the parallel region. An
- // exception is when there are < WARPSIZE workers. In this case only 1
- // worker is started, so we don't need a barrier.
- if (numberOfActiveOMPThreads > 1) {
-#endif
+ int numberOfActiveOMPThreads = GetNumberOfOmpThreads(
+ tid, isSPMDMode(), /*isRuntimeUninitialized=*/false);
+ if (numberOfActiveOMPThreads > 1) {
+ if (isSPMDMode()) {
+ __kmpc_barrier_simple_spmd(loc_ref, tid);
+ } else {
// The #threads parameter must be rounded up to the WARPSIZE.
int threads =
WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
@@ -69,10 +65,8 @@ EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
numberOfActiveOMPThreads, threads);
// Barrier #1 is for synchronization among active threads.
named_sync(L1_BARRIER, threads);
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- } // numberOfActiveOMPThreads > 1
-#endif
- }
+ }
+ } // numberOfActiveOMPThreads > 1
PRINT0(LD_SYNC, "completed kmpc_barrier\n");
}
}