aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Alexey Bataev <a.bataev@hotmail.com> 2018-07-12 15:18:28 +0000
committer Alexey Bataev <a.bataev@hotmail.com> 2018-07-12 15:18:28 +0000
commit 07d9ad356558ea7700cc1794848a7e327f9192ea (patch)
tree c704bbf42ab229827f47d45fbf8d643d5e58a590
parent c6ae13af2093a744069c57c2344b679e48a6caae (diff)
[OPENMP, NVPTX] Fix loop boundaries calculation for dynamic loops.
Summary: This patch fixes the following problems: 1. Removes unused functions from the omptarget_nvptx_ThreadPrivateContext class and simplifies its data members. 2. Fixes the calculation of loop boundaries for dynamic loops with static scheduling. 3. Introduces saving/restoring of the dynamic loop boundaries to support several nested parallel dynamic loops. Reviewers: grokos Subscribers: guansong, kkwli0, openmp-commits Differential Revision: https://reviews.llvm.org/D49241 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@336915 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- libomptarget/deviceRTLs/nvptx/src/loop.cu 13
-rw-r--r-- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h 26
-rw-r--r-- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h 24
-rw-r--r-- libomptarget/deviceRTLs/nvptx/src/parallel.cu 3
4 files changed, 50 insertions, 16 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu
index 1aab53e..60818af 100644
--- a/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -298,7 +298,9 @@ public:
// compute static chunk
ST stride;
int lastiter = 0;
- ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+ ForStaticChunk(
+ lastiter, lb, ub, stride, chunk,
+ GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum);
// save computed params
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
@@ -320,7 +322,9 @@ public:
// compute static chunk
ST stride;
int lastiter = 0;
- ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+ ForStaticNoChunk(
+ lastiter, lb, ub, stride, chunk,
+ GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum);
// save computed params
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
@@ -366,10 +370,11 @@ public:
// Support for dispatch next
INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
- Counter &loopLowerBound,
+ int64_t &loopLowerBound,
T loopUpperBound) {
// calculate lower bound for all lanes in the warp
- lb = atomicAdd(&loopLowerBound, (Counter)chunkSize);
+ lb = atomicAdd((unsigned long long *)&loopLowerBound,
+ (unsigned long long)chunkSize);
ub = lb + chunkSize - 1; // Clang uses i <= ub
// 3 result cases:
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 2bc5819..b8b6975 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -192,6 +192,8 @@ public:
INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
uint16_t tid, uint16_t tnum);
+ INLINE void SaveLoopData();
+ INLINE void RestoreLoopData() const;
private:
// bits for flags: (7 used, 1 free)
@@ -207,6 +209,14 @@ private:
static const uint8_t TaskDescr_IsParConstr = 0x20;
static const uint8_t TaskDescr_InParL2P = 0x40;
+ struct SavedLoopDescr_items {
+ int64_t loopUpperBound;
+ int64_t nextLowerBound;
+ int64_t chunk;
+ int64_t stride;
+ kmp_sched_t schedule;
+ } loopData;
+
struct TaskDescr_items {
uint8_t flags; // 6 bit used (see flag above)
uint8_t unused;
@@ -335,16 +345,8 @@ public:
INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
- // state for dispatch with dyn/guided
- INLINE Counter &CurrentEvent(int tid) {
- return currEvent_or_nextLowerBound[tid];
- }
- INLINE Counter &EventsNumber(int tid) { return eventsNum_or_stride[tid]; }
- // state for dispatch with static
- INLINE Counter &NextLowerBound(int tid) {
- return currEvent_or_nextLowerBound[tid];
- }
- INLINE Counter &Stride(int tid) { return eventsNum_or_stride[tid]; }
+ INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
+ INLINE int64_t &Stride(int tid) { return stride[tid]; }
INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
@@ -373,8 +375,8 @@ private:
int64_t chunk[MAX_THREADS_PER_TEAM];
int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
// state for dispatch with dyn/guided OR static (never use both at a time)
- Counter currEvent_or_nextLowerBound[MAX_THREADS_PER_TEAM];
- Counter eventsNum_or_stride[MAX_THREADS_PER_TEAM];
+ int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
+ int64_t stride[MAX_THREADS_PER_TEAM];
// Queue to which this object must be returned.
uint64_t SourceQueue;
};
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
index 435a034..086f4c5 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
@@ -125,6 +125,30 @@ INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
items.threadId = tid;
}
+INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() {
+ loopData.loopUpperBound =
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId);
+ loopData.nextLowerBound =
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId);
+ loopData.schedule =
+ omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId);
+ loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId);
+ loopData.stride =
+ omptarget_nvptx_threadPrivateContext->Stride(items.threadId);
+}
+
+INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
+ omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk;
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) =
+ loopData.loopUpperBound;
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) =
+ loopData.nextLowerBound;
+ omptarget_nvptx_threadPrivateContext->Stride(items.threadId) =
+ loopData.stride;
+ omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) =
+ loopData.schedule;
+}
+
////////////////////////////////////////////////////////////////////////////////
// Thread Private Context
////////////////////////////////////////////////////////////////////////////////
diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index d454628..33509b6 100644
--- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -386,6 +386,7 @@ EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) {
// get current task
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
+ currTaskDescr->SaveLoopData();
// allocate new task descriptor and copy value from current one, set prev to
// it
@@ -417,6 +418,8 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc,
threadId, currTaskDescr->GetPrevTaskDescr());
// free
SafeFree(currTaskDescr, (char *)"new seq parallel task");
+ currTaskDescr = getMyTopTaskDescriptor(threadId);
+ currTaskDescr->RestoreLoopData();
}
EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) {