[OPNEMP, NVPTX] Fixed sychronization construct + code cleanup.

Summary: 1. Fixed internal problem in `__kmpc_barrier` function: SPMD mode synchronization function should be called only in L1 parallel level. 2. Removed some extra code for synchronization inside of the code, used `__kmpc_barrier` instead. 3. Some code cleanup. Reviewers: gtbercea, grokos Subscribers: openmp-commits Differential Revision: https://reviews.llvm.org/D49564 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@337691 91177308-0d34-0410-b5e6-96231b3b80d8
author: Alexey Bataev <a.bataev@hotmail.com> 2018-07-23 13:52:12 +0000
committer: Alexey Bataev <a.bataev@hotmail.com> 2018-07-23 13:52:12 +0000
commit: 7c9019ff7177253995729bb61a16e6ce65567966 (patch)
tree: 40220f64de964ed02f7ebf510e04e56b8d08f913
parent: d7d72b740038e4285e1d51d6feb48330c0fb6f5a (diff)
4 files changed, 25 insertions, 54 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu
index 60818af..f3e475d 100644
--- a/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -240,12 +240,8 @@ public:
 
     // Process schedule.
     if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
-      if (OrderedSchedule(schedule)) {
-        if (isSPMDMode())
-          __syncthreads();
-        else
-          __kmpc_barrier(loc, threadId);
-      }
+      if (OrderedSchedule(schedule))
+        __kmpc_barrier(loc, threadId);
       PRINT(LD_LOOP,
             "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
             (long)tnum, P64(tripCount), schedule);
@@ -338,10 +334,7 @@ public:
             omptarget_nvptx_threadPrivateContext->Stride(tid));
 
     } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
-      if (isSPMDMode())
-        __syncthreads();
-      else
-        __kmpc_barrier(loc, threadId);
+      __kmpc_barrier(loc, threadId);
       // save sched state
       int teamId = GetOmpTeamId();
       omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
@@ -352,10 +345,7 @@ public:
         omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
         omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
       }
-      if (isSPMDMode())
-        __syncthreads();
-      else
-        __kmpc_barrier(loc, threadId);
+      __kmpc_barrier(loc, threadId);
       PRINT(LD_LOOP,
             "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
             ", chunk %" PRIu64 "\n",
diff --git a/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/libomptarget/deviceRTLs/nvptx/src/reduction.cu
index aedb635..b813a11 100644
--- a/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -25,9 +25,8 @@ int32_t __gpu_block_reduce() {
   if (nt != blockDim.x)
     return 0;
   unsigned tnum = __ACTIVEMASK();
-  if (tnum != (~0x0)) { // assume swapSize is 32
+  if (tnum != (~0x0)) // assume swapSize is 32
     return 0;
-  }
   return 1;
 }
 
@@ -48,32 +47,21 @@ int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars,
 
   if (numthread == 1)
     return 1;
-  else if (!__gpu_block_reduce())
+  if (!__gpu_block_reduce())
     return 2;
-  else {
-    if (threadIdx.x == 0)
-      return 1;
-    else
-      return 0;
-  }
+  if (threadIdx.x == 0)
+    return 1;
+  return 0;
 }
 
 EXTERN
 int32_t __kmpc_reduce_combined(kmp_Indent *loc) {
-  if (threadIdx.x == 0) {
-    return 2;
-  } else {
-    return 0;
-  }
+  return threadIdx.x == 0 ? 2 : 0;
 }
 
 EXTERN
 int32_t __kmpc_reduce_simd(kmp_Indent *loc) {
-  if (threadIdx.x % 32 == 0) {
-    return 1;
-  } else {
-    return 0;
-  }
+  return (threadIdx.x % 32 == 0) ? 1 : 0;
 }
 
 EXTERN
diff --git a/libomptarget/deviceRTLs/nvptx/src/supporti.h b/libomptarget/deviceRTLs/nvptx/src/supporti.h
index 2b267c3..4de2039 100644
--- a/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -155,8 +155,7 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
 INLINE int GetNumberOfProcsInDevice() {
   if (isGenericMode())
     return GetNumberOfWorkersInTeam();
-  else
-    return GetNumberOfThreadsInBlock();
+  return GetNumberOfThreadsInBlock();
 }
 
 INLINE int GetNumberOfProcsInTeam() { return GetNumberOfProcsInDevice(); }
diff --git a/libomptarget/deviceRTLs/nvptx/src/sync.cu b/libomptarget/deviceRTLs/nvptx/src/sync.cu
index 68f08a1..7e55df8 100644
--- a/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -41,25 +41,21 @@ EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) {
 }
 
 EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
-  if (isSPMDMode()) {
-    __kmpc_barrier_simple_spmd(loc_ref, tid);
-  } else if (isRuntimeUninitialized()) {
-    __kmpc_barrier_simple_generic(loc_ref, tid);
+  if (isRuntimeUninitialized()) {
+    if (isSPMDMode())
+      __kmpc_barrier_simple_spmd(loc_ref, tid);
+    else
+      __kmpc_barrier_simple_generic(loc_ref, tid);
   } else {
     tid = GetLogicalThreadIdInBlock();
     omptarget_nvptx_TaskDescr *currTaskDescr =
         omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
-    if (!currTaskDescr->InL2OrHigherParallelRegion()) {
-      int numberOfActiveOMPThreads =
-          GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-      // On Volta and newer architectures we require that all lanes in
-      // a warp (at least, all present for the kernel launch) participate in the
-      // barrier.  This is enforced when launching the parallel region.  An
-      // exception is when there are < WARPSIZE workers.  In this case only 1
-      // worker is started, so we don't need a barrier.
-      if (numberOfActiveOMPThreads > 1) {
-#endif
+    int numberOfActiveOMPThreads = GetNumberOfOmpThreads(
+        tid, isSPMDMode(), /*isRuntimeUninitialized=*/false);
+    if (numberOfActiveOMPThreads > 1) {
+      if (isSPMDMode()) {
+        __kmpc_barrier_simple_spmd(loc_ref, tid);
+      } else {
         // The #threads parameter must be rounded up to the WARPSIZE.
         int threads =
             WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
@@ -69,10 +65,8 @@ EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
               numberOfActiveOMPThreads, threads);
         // Barrier #1 is for synchronization among active threads.
         named_sync(L1_BARRIER, threads);
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-      } // numberOfActiveOMPThreads > 1
-#endif
-    }
+      }
+    } // numberOfActiveOMPThreads > 1
     PRINT0(LD_SYNC, "completed kmpc_barrier\n");
   }
 }
author	Alexey Bataev <a.bataev@hotmail.com>	2018-07-23 13:52:12 +0000
committer	Alexey Bataev <a.bataev@hotmail.com>	2018-07-23 13:52:12 +0000
commit	7c9019ff7177253995729bb61a16e6ce65567966 (patch)
tree	40220f64de964ed02f7ebf510e04e56b8d08f913
parent	d7d72b740038e4285e1d51d6feb48330c0fb6f5a (diff)