aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Alexey Bataev <a.bataev@hotmail.com> 2019-01-07 14:25:25 +0000
committer Alexey Bataev <a.bataev@hotmail.com> 2019-01-07 14:25:25 +0000
commit 12eef2767778bdadcdda41293c6516aec3f960c1 (patch)
tree 77062198c3a62674c7ac933b846bcf0a1154f0a6
parent 38895e681eba872ed3a16e6428366cdcb398d61c (diff)
[OPENMP][NVPTX]Fix dynamic scheduling.
Summary: The previous implementation could crash at runtime when the number of teams was greater than 1024. This patch fixes that problem and reduces the number of atomic operations by a factor of 32. Reviewers: grokos, gtbercea, kkwli0 Subscribers: guansong, jfb, openmp-commits, caomhin Differential Revision: https://reviews.llvm.org/D56332 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@350524 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- libomptarget/deviceRTLs/nvptx/src/loop.cu 67
-rw-r--r-- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h 2
2 files changed, 47 insertions, 22 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu
index b8a61a4..998ce54 100644
--- a/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -352,18 +352,18 @@ public:
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
(unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
tid));
-
} else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
- __kmpc_barrier(loc, threadId);
- // save sched state
- int teamId = GetOmpTeamId();
+ // save data
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
- if (GetThreadIdInBlock() == 0) {
- if (chunk < 1)
- chunk = 1;
- omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk;
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
- omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
+ if (chunk < 1)
+ chunk = 1;
+ omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+ __kmpc_barrier(loc, threadId);
+ if (tid == 0) {
+ omptarget_nvptx_threadPrivateContext->Cnt() = 0;
+ __threadfence_block();
}
__kmpc_barrier(loc, threadId);
PRINT(LD_LOOP,
@@ -371,21 +371,45 @@ public:
", chunk %" PRIu64 "\n",
(int)tnum,
(unsigned long long)
- omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
- omptarget_nvptx_threadPrivateContext->Chunk(teamId));
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+ omptarget_nvptx_threadPrivateContext->Chunk(tid));
}
}
////////////////////////////////////////////////////////////////////////////////
// Support for dispatch next
+ INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) { // Returns a 64-bit value reassembled from two 32-bit halves that were each passed through __SHFL_SYNC(active, ..., leader).
+ int lo, hi; // the two 32-bit halves of val
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); // split the 64-bit val into the 32-bit registers lo and hi
+ hi = __SHFL_SYNC(active, hi, leader); // NOTE(review): __SHFL_SYNC is defined outside this view; presumably a warp shuffle that reads the value held by lane `leader` among the lanes in `active` - confirm
+ lo = __SHFL_SYNC(active, lo, leader); // same exchange for the other half
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); // pack the exchanged halves back into one 64-bit value
+ return val;
+ }
+
+ INLINE static uint64_t NextIter() { // Returns (value obtained from one atomicAdd on Cnt(), taken from lane `leader`) + this lane's rank; only the rank-0 lane issues the atomic.
+ unsigned int active = __ACTIVEMASK(); // NOTE(review): __ACTIVEMASK is defined outside this view; presumably the bitmask of lanes currently executing together - confirm
+ int leader = __ffs(active) - 1; // zero-based index of the lowest set bit in active (__ffs is 1-based)
+ int change = __popc(active); // number of set bits in active; this is the amount added to the counter
+ unsigned lane_mask_lt; // receives the %lanemask_lt special register
+ asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt)); // PTX %lanemask_lt: bits set for lane positions below the current lane
+ unsigned int rank = __popc(active & lane_mask_lt); // count of set bits in active that lie below this lane
+ uint64_t warp_res; // NOTE(review): left uninitialized on lanes with rank != 0, yet passed by value to Shuffle below before being overwritten
+ if (rank == 0) { // only the lane with no active-mask bits below it performs the atomic
+ warp_res = atomicAdd( // atomicAdd returns the value stored before the addition
+ (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
+ change); // a single add of `change` replaces one atomic per participating lane
+ }
+ warp_res = Shuffle(active, warp_res, leader); // every lane replaces its warp_res with the result of Shuffle sourced from `leader`
+ return warp_res + rank; // offset the shared base by this lane's rank so each lane gets a distinct value
+ }
+
INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
- int64_t &loopLowerBound,
- T loopUpperBound) {
- // calculate lower bound for all lanes in the warp
- lb = atomicAdd((unsigned long long *)&loopLowerBound,
- (unsigned long long)chunkSize);
+ T loopLowerBound, T loopUpperBound) {
+ T N = NextIter();
+ lb = loopLowerBound + N * chunkSize;
ub = lb + chunkSize - 1; // Clang uses i <= ub
// 3 result cases:
@@ -461,11 +485,10 @@ public:
schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
"bad sched");
T myLb, myUb;
- int teamId = GetOmpTeamId();
int finished = DynamicNextChunk(
- myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId),
- omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId));
+ myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid),
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
if (finished == FINISHED)
return DISPATCH_FINISHED;
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 7a05d93..cb6c0b7 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -344,6 +344,7 @@ public:
INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
INLINE void InitThreadPrivateContext(int tid);
+ INLINE uint64_t &Cnt() { return cnt; } // Accessor: returns a mutable reference to the cnt member so callers can read or modify it in place.
private:
// team context for this team
@@ -366,6 +367,7 @@ private:
// state for dispatch with dyn/guided OR static (never use both at a time)
int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
int64_t stride[MAX_THREADS_PER_TEAM];
+ uint64_t cnt;
};
/// Device environment data