diff options
author | Alexey Bataev <a.bataev@hotmail.com> | 2018-07-12 15:18:28 +0000 |
---|---|---|
committer | Alexey Bataev <a.bataev@hotmail.com> | 2018-07-12 15:18:28 +0000 |
commit | 07d9ad356558ea7700cc1794848a7e327f9192ea (patch) | |
tree | c704bbf42ab229827f47d45fbf8d643d5e58a590 | |
parent | c6ae13af2093a744069c57c2344b679e48a6caae (diff) |
[OPENMP, NVPTX] Fix loop boundaries calculation for dynamic loops.
Summary:
This patch fixes the following problems:
1. Removed unused functions from the omptarget_nvptx_ThreadPrivateContext
class and simplified its data members.
2. Fixed calculation of loop boundaries for dynamic loops with static
scheduling.
3. Introduced saving/restoring of the dynamic loop boundaries to support
several nested parallel dynamic loops.
Reviewers: grokos
Subscribers: guansong, kkwli0, openmp-commits
Differential Revision: https://reviews.llvm.org/D49241
git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@336915 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/loop.cu | 13 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h | 26 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h | 24 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/parallel.cu | 3 |
4 files changed, 50 insertions, 16 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu index 1aab53e..60818af 100644 --- a/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -298,7 +298,9 @@ public: // compute static chunk ST stride; int lastiter = 0; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + ForStaticChunk( + lastiter, lb, ub, stride, chunk, + GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum); // save computed params omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; @@ -320,7 +322,9 @@ public: // compute static chunk ST stride; int lastiter = 0; - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + ForStaticNoChunk( + lastiter, lb, ub, stride, chunk, + GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum); // save computed params omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; @@ -366,10 +370,11 @@ public: // Support for dispatch next INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize, - Counter &loopLowerBound, + int64_t &loopLowerBound, T loopUpperBound) { // calculate lower bound for all lanes in the warp - lb = atomicAdd(&loopLowerBound, (Counter)chunkSize); + lb = atomicAdd((unsigned long long *)&loopLowerBound, + (unsigned long long)chunkSize); ub = lb + chunkSize - 1; // Clang uses i <= ub // 3 result cases: diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index 2bc5819..b8b6975 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -192,6 +192,8 @@ public: INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum); + 
INLINE void SaveLoopData(); + INLINE void RestoreLoopData() const; private: // bits for flags: (7 used, 1 free) @@ -207,6 +209,14 @@ private: static const uint8_t TaskDescr_IsParConstr = 0x20; static const uint8_t TaskDescr_InParL2P = 0x40; + struct SavedLoopDescr_items { + int64_t loopUpperBound; + int64_t nextLowerBound; + int64_t chunk; + int64_t stride; + kmp_sched_t schedule; + } loopData; + struct TaskDescr_items { uint8_t flags; // 6 bit used (see flag above) uint8_t unused; @@ -335,16 +345,8 @@ public: INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } INLINE int64_t &Chunk(int tid) { return chunk[tid]; } INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } - // state for dispatch with dyn/guided - INLINE Counter &CurrentEvent(int tid) { - return currEvent_or_nextLowerBound[tid]; - } - INLINE Counter &EventsNumber(int tid) { return eventsNum_or_stride[tid]; } - // state for dispatch with static - INLINE Counter &NextLowerBound(int tid) { - return currEvent_or_nextLowerBound[tid]; - } - INLINE Counter &Stride(int tid) { return eventsNum_or_stride[tid]; } + INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; } + INLINE int64_t &Stride(int tid) { return stride[tid]; } INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } @@ -373,8 +375,8 @@ private: int64_t chunk[MAX_THREADS_PER_TEAM]; int64_t loopUpperBound[MAX_THREADS_PER_TEAM]; // state for dispatch with dyn/guided OR static (never use both at a time) - Counter currEvent_or_nextLowerBound[MAX_THREADS_PER_TEAM]; - Counter eventsNum_or_stride[MAX_THREADS_PER_TEAM]; + int64_t nextLowerBound[MAX_THREADS_PER_TEAM]; + int64_t stride[MAX_THREADS_PER_TEAM]; // Queue to which this object must be returned. 
uint64_t SourceQueue; }; diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h index 435a034..086f4c5 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h @@ -125,6 +125,30 @@ INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent( items.threadId = tid; } +INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() { + loopData.loopUpperBound = + omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId); + loopData.nextLowerBound = + omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId); + loopData.schedule = + omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId); + loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId); + loopData.stride = + omptarget_nvptx_threadPrivateContext->Stride(items.threadId); +} + +INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const { + omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk; + omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) = + loopData.loopUpperBound; + omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) = + loopData.nextLowerBound; + omptarget_nvptx_threadPrivateContext->Stride(items.threadId) = + loopData.stride; + omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) = + loopData.schedule; +} + //////////////////////////////////////////////////////////////////////////////// // Thread Private Context //////////////////////////////////////////////////////////////////////////////// diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu index d454628..33509b6 100644 --- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -386,6 +386,7 @@ EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) { // get current task 
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); + currTaskDescr->SaveLoopData(); // allocate new task descriptor and copy value from current one, set prev to // it @@ -417,6 +418,8 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, threadId, currTaskDescr->GetPrevTaskDescr()); // free SafeFree(currTaskDescr, (char *)"new seq parallel task"); + currTaskDescr = getMyTopTaskDescriptor(threadId); + currTaskDescr->RestoreLoopData(); } EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) { |