Diffstat (limited to 'final/libomptarget/deviceRTLs/nvptx/src/loop.cu')
-rw-r--r--  final/libomptarget/deviceRTLs/nvptx/src/loop.cu  769
1 file changed, 769 insertions, 0 deletions
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/loop.cu b/final/libomptarget/deviceRTLs/nvptx/src/loop.cu
new file mode 100644
index 0000000..f3e475d
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -0,0 +1,769 @@
+//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the KMPC interface
+// for the loop construct plus other worksharing constructs that use the same
+// interface as loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+// template class that encapsulates all the helper functions
+//
+// T is loop iteration type (32 | 64) (unsigned | signed)
+// ST is the signed version of T
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+template <typename T, typename ST> class omptarget_nvptx_LoopSupport {
+public:
+ ////////////////////////////////////////////////////////////////////////////////
+ // Loop with static scheduling with chunk
+
+ // Generic implementation of OMP loop scheduling with static policy
+  /*! \brief Calculate initial bounds for static loop and stride
+   * @param[in] loc location in code of the call (not used here)
+   * @param[in] global_tid global thread id
+   * @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
+   * @param[in] plastiter pointer to last iteration
+   * @param[in,out] plower pointer to loop lower bound. It will contain the
+   * value of the lower bound of the first chunk
+   * @param[in,out] pupper pointer to loop upper bound. It will contain the
+   * value of the upper bound of the first chunk
+   * @param[in,out] pstride pointer to loop stride. It will contain the value
+   * of the stride between two successive chunks executed by the same thread
+   * @param[in] incr loop increment bump
+   * @param[in] chunk chunk size
+   */
+
+ // helper function for static chunk
+ INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride,
+ ST chunk, T entityId, T numberOfEntities) {
+    // each thread executes multiple chunks, all of the same size
+    // except the last one
+
+ // distance between two successive chunks
+ stride = numberOfEntities * chunk;
+ lb = lb + entityId * chunk;
+ T inputUb = ub;
+ ub = lb + chunk - 1; // Clang uses i <= ub
+    // Say ub' is the beginning of the last chunk. Then whoever has a
+    // lower bound plus a multiple of the stride equal to ub' is
+    // the last one.
+    T beginningLastChunk = inputUb - (inputUb % chunk);
+    last = ((beginningLastChunk - lb) % stride) == 0;
+ }
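+
+  // Worked example (illustrative only, not part of the runtime): with
+  // lb = 0, ub = 99, chunk = 10 and numberOfEntities = 4, the arithmetic
+  // above gives each entity its first chunk plus a common stride:
+  //
+  //   stride = 4 * 10 = 40                  // distance between chunks
+  //   entity 0: lb = 0,  ub = 9,  then 40..49 and 80..89
+  //   entity 1: lb = 10, ub = 19, then 50..59 and 90..99
+  //   entity 2: lb = 20, ub = 29, then 60..69
+  //   entity 3: lb = 30, ub = 39, then 70..79
+  //
+  //   beginning of last chunk = 99 - (99 % 10) = 90, and
+  //   (90 - lb) % 40 == 0 holds only for entity 1 (lb = 10), so entity 1
+  //   reports `last`.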
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Loop with static scheduling without chunk
+
+ // helper function for static no chunk
+ INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride,
+ ST &chunk, T entityId,
+ T numberOfEntities) {
+    // No chunk size specified. Each thread or warp gets at most one
+    // chunk; chunks are all of nearly equal size
+ T loopSize = ub - lb + 1;
+
+ chunk = loopSize / numberOfEntities;
+ T leftOver = loopSize - chunk * numberOfEntities;
+
+ if (entityId < leftOver) {
+ chunk++;
+ lb = lb + entityId * chunk;
+ } else {
+ lb = lb + entityId * chunk + leftOver;
+ }
+
+ T inputUb = ub;
+ ub = lb + chunk - 1; // Clang uses i <= ub
+ last = lb <= inputUb && inputUb <= ub;
+    stride = loopSize; // make sure we only do 1 chunk per entity
+ }
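+
+  // Worked example (illustrative only): lb = 0, ub = 9, numberOfEntities = 4.
+  // loopSize = 10, chunk = 10 / 4 = 2, leftOver = 10 - 2 * 4 = 2, so
+  // entities 0 and 1 get chunk bumped to 3 and the rest keep chunk = 2:
+  //
+  //   entity 0: lb = 0 * 3 = 0,     ub = 2
+  //   entity 1: lb = 1 * 3 = 3,     ub = 5
+  //   entity 2: lb = 2 * 2 + 2 = 6, ub = 7
+  //   entity 3: lb = 3 * 2 + 2 = 8, ub = 9   <- contains inputUb, so last = 1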
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Support for Static Init
+
+ INLINE static void for_static_init(int32_t schedtype, int32_t *plastiter,
+ T *plower, T *pupper, ST *pstride,
+ ST chunk, bool IsSPMDExecutionMode,
+ bool IsRuntimeUninitialized) {
+ // When IsRuntimeUninitialized is true, we assume that the caller is
+ // in an L0 parallel region and that all worker threads participate.
+
+ int tid = GetLogicalThreadIdInBlock();
+
+    // Assume we are in a teams region or that we use a single block
+    // per target region
+ ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(
+ tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
+
+    // All warps that are in excess of the maximum requested do
+    // not execute the loop
+ PRINT(LD_LOOP,
+ "OMP Thread %d: schedule type %d, chunk size = %lld, mytid "
+ "%d, num tids %d\n",
+ GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized),
+ schedtype, P64(chunk),
+ GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized),
+ GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+ IsRuntimeUninitialized));
+ ASSERT0(
+ LT_FUSSY,
+ (GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized)) <
+ (GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+ IsRuntimeUninitialized)),
+ "current thread is not needed here; error");
+
+ // copy
+ int lastiter = 0;
+ T lb = *plower;
+ T ub = *pupper;
+ ST stride = *pstride;
+ T entityId, numberOfEntities;
+ // init
+ switch (schedtype) {
+ case kmp_sched_static_chunk: {
+ if (chunk > 0) {
+ entityId =
+ GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
+ numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+ IsRuntimeUninitialized);
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
+ numberOfEntities);
+ break;
+ }
+    } // note: if chunk <= 0, fall through to the nochunk case
+ case kmp_sched_static_nochunk: {
+ entityId =
+ GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
+ numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+ IsRuntimeUninitialized);
+ ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId,
+ numberOfEntities);
+ break;
+ }
+ case kmp_sched_distr_static_chunk: {
+ if (chunk > 0) {
+ entityId = GetOmpTeamId();
+ numberOfEntities = GetNumberOfOmpTeams();
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
+ numberOfEntities);
+ break;
+      } // note: if chunk <= 0, fall through to the nochunk case
+ }
+ case kmp_sched_distr_static_nochunk: {
+ entityId = GetOmpTeamId();
+ numberOfEntities = GetNumberOfOmpTeams();
+
+ ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId,
+ numberOfEntities);
+ break;
+ }
+ case kmp_sched_distr_static_chunk_sched_static_chunkone: {
+ entityId =
+ GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+ IsRuntimeUninitialized) *
+ GetOmpTeamId() +
+ GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
+ numberOfEntities = GetNumberOfOmpTeams() *
+ GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+ IsRuntimeUninitialized);
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
+ numberOfEntities);
+ break;
+ }
+ default: {
+ ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", schedtype);
+ PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n",
+ schedtype);
+ entityId =
+ GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
+ numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+ IsRuntimeUninitialized);
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
+ numberOfEntities);
+ }
+ }
+ // copy back
+ *plastiter = lastiter;
+ *plower = lb;
+ *pupper = ub;
+ *pstride = stride;
+ PRINT(LD_LOOP,
+ "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld\n",
+ GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+ IsRuntimeUninitialized),
+ GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper),
+ P64(*pstride));
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Support for dispatch Init
+
+ INLINE static int OrderedSchedule(kmp_sched_t schedule) {
+ return schedule >= kmp_sched_ordered_first &&
+ schedule <= kmp_sched_ordered_last;
+ }
+
+ INLINE static void dispatch_init(kmp_Indent *loc, int32_t threadId,
+ kmp_sched_t schedule, T lb, T ub, ST st,
+ ST chunk) {
+ int tid = GetLogicalThreadIdInBlock();
+ omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
+ T tnum = currTaskDescr->ThreadsInTeam();
+ T tripCount = ub - lb + 1; // +1 because ub is inclusive
+ ASSERT0(
+ LT_FUSSY,
+ GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) <
+ GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
+ "current thread is not needed here; error");
+
+    /* Currently just ignore the monotonic and non-monotonic modifiers
+     * (the compiler isn't producing them yet anyway).
+     * When it is, we'll want to look at them somewhere here and use that
+     * information to add to our schedule choice. We shouldn't need to pass
+     * them on; they merely affect which schedule we can legally choose for
+     * various dynamic cases. (In particular, whether or not a stealing scheme
+     * is legal.)
+     */
+ schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
+
+ // Process schedule.
+ if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
+ if (OrderedSchedule(schedule))
+ __kmpc_barrier(loc, threadId);
+ PRINT(LD_LOOP,
+ "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
+ (long)tnum, P64(tripCount), schedule);
+ schedule = kmp_sched_static_chunk;
+ chunk = tripCount; // one thread gets the whole loop
+ } else if (schedule == kmp_sched_runtime) {
+ // process runtime
+ omp_sched_t rtSched = currTaskDescr->GetRuntimeSched();
+ chunk = currTaskDescr->RuntimeChunkSize();
+ switch (rtSched) {
+ case omp_sched_static: {
+ if (chunk > 0)
+ schedule = kmp_sched_static_chunk;
+ else
+ schedule = kmp_sched_static_nochunk;
+ break;
+ }
+ case omp_sched_auto: {
+ schedule = kmp_sched_static_chunk;
+ chunk = 1;
+ break;
+ }
+ case omp_sched_dynamic:
+ case omp_sched_guided: {
+ schedule = kmp_sched_dynamic;
+ break;
+ }
+ }
+ PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", schedule,
+ P64(chunk));
+ } else if (schedule == kmp_sched_auto) {
+ schedule = kmp_sched_static_chunk;
+ chunk = 1;
+ PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", schedule,
+ P64(chunk));
+ } else {
+ PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", schedule, P64(chunk));
+ ASSERT(LT_FUSSY,
+ schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
+ "unknown schedule %d & chunk %lld\n", schedule, P64(chunk));
+ }
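+
+    // Net effect of the resolution above, as a quick reference (sketch):
+    //   tnum == 1, tripCount <= 1, or ordered -> static_chunk, chunk = tripCount
+    //   runtime + omp_sched_static            -> static_chunk or static_nochunk
+    //   runtime + omp_sched_auto              -> static_chunk, chunk = 1
+    //   runtime + omp_sched_dynamic/guided    -> kmp_sched_dynamic
+    //   kmp_sched_auto                        -> static_chunk, chunk = 1
+    //   kmp_sched_dynamic/guided              -> left unchanged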
+
+ // init schedules
+ if (schedule == kmp_sched_static_chunk) {
+ ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+ // save sched state
+ omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+ // save ub
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+ // compute static chunk
+ ST stride;
+ int lastiter = 0;
+ ForStaticChunk(
+ lastiter, lb, ub, stride, chunk,
+ GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum);
+ // save computed params
+ omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+ omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
+ PRINT(LD_LOOP,
+ "dispatch init (static chunk) : num threads = %d, ub = %" PRId64
+ ", next lower bound = %llu, stride = %llu\n",
+ GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+ omptarget_nvptx_threadPrivateContext->Stride(tid));
+
+ } else if (schedule == kmp_sched_static_nochunk) {
+ ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
+ // save sched state
+ omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+ // save ub
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+ // compute static chunk
+ ST stride;
+ int lastiter = 0;
+ ForStaticNoChunk(
+ lastiter, lb, ub, stride, chunk,
+ GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum);
+ // save computed params
+ omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+ omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
+ PRINT(LD_LOOP,
+ "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
+ ", next lower bound = %llu, stride = %llu\n",
+ GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+ omptarget_nvptx_threadPrivateContext->Stride(tid));
+
+ } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
+ __kmpc_barrier(loc, threadId);
+ // save sched state
+ int teamId = GetOmpTeamId();
+ omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+ if (GetThreadIdInBlock() == 0) {
+ if (chunk < 1)
+ chunk = 1;
+ omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk;
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
+ }
+ __kmpc_barrier(loc, threadId);
+ PRINT(LD_LOOP,
+ "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
+ ", chunk %" PRIu64 "\n",
+ GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
+ omptarget_nvptx_threadPrivateContext->Chunk(teamId));
+ }
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Support for dispatch next
+
+ INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
+ int64_t &loopLowerBound,
+ T loopUpperBound) {
+ // calculate lower bound for all lanes in the warp
+ lb = atomicAdd((unsigned long long *)&loopLowerBound,
+ (unsigned long long)chunkSize);
+ ub = lb + chunkSize - 1; // Clang uses i <= ub
+
+ // 3 result cases:
+    // a. lb <= loopUpperBound and ub < loopUpperBound --> NOT_FINISHED
+    // b. lb <= loopUpperBound and ub >= loopUpperBound: last chunk -->
+    //    LAST_CHUNK
+    // c. lb > loopUpperBound: empty chunk --> FINISHED
+ // a.
+ if (lb <= loopUpperBound && ub < loopUpperBound) {
+ PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb),
+ P64(ub), P64(loopUpperBound));
+ return NOT_FINISHED;
+ }
+ // b.
+ if (lb <= loopUpperBound) {
+ PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n",
+ P64(lb), P64(ub), P64(loopUpperBound));
+ ub = loopUpperBound;
+ return LAST_CHUNK;
+ }
+    // c. if we are here, the chunk is empty: set bounds to an empty range
+ lb = loopUpperBound + 2;
+ ub = loopUpperBound + 1;
+ PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb),
+ P64(ub), P64(loopUpperBound));
+ return FINISHED;
+ }
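+
+  // Sketch of the claiming protocol above (illustrative values only):
+  // with chunkSize = 10 and loopUpperBound = 24, concurrent callers each
+  // atomicAdd on the shared loopLowerBound and so receive disjoint chunks:
+  //
+  //   caller A: lb = 0,  ub = 9                      --> NOT_FINISHED
+  //   caller B: lb = 10, ub = 19                     --> NOT_FINISHED
+  //   caller C: lb = 20, ub = 29 -> ub clipped to 24 --> LAST_CHUNK
+  //   caller D: lb = 30 > 24                         --> FINISHED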
+
+ // On Pascal, with inlining of the runtime into the user application,
+ // this code deadlocks. This is probably because different threads
+ // in a warp cannot make independent progress.
+ NOINLINE static int dispatch_next(int32_t *plast, T *plower, T *pupper,
+ ST *pstride) {
+    // logical ID: automatically selects thread or warp ID based on the
+    // selected implementation
+    int tid = GetLogicalThreadIdInBlock();
+ ASSERT0(
+ LT_FUSSY,
+ GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) <
+ GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
+ "current thread is not needed here; error");
+ // retrieve schedule
+ kmp_sched_t schedule =
+ omptarget_nvptx_threadPrivateContext->ScheduleType(tid);
+
+ // xxx reduce to one
+ if (schedule == kmp_sched_static_chunk ||
+ schedule == kmp_sched_static_nochunk) {
+ T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid);
+ T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid);
+ // finished?
+ if (myLb > ub) {
+ PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n",
+ P64(myLb), P64(ub));
+ return DISPATCH_FINISHED;
+ }
+ // not finished, save current bounds
+ ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid);
+ *plower = myLb;
+ T myUb = myLb + chunk - 1; // Clang uses i <= ub
+ if (myUb > ub)
+ myUb = ub;
+ *pupper = myUb;
+ *plast = (int32_t)(myUb == ub);
+
+ // increment next lower bound by the stride
+ ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid);
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride;
+ PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n",
+ P64(*plower), P64(*pupper));
+ return DISPATCH_NOTFINISHED;
+ }
+ ASSERT0(LT_FUSSY,
+ schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
+ "bad sched");
+ T myLb, myUb;
+ int teamId = GetOmpTeamId();
+ int finished = DynamicNextChunk(
+ myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId),
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId));
+
+ if (finished == FINISHED)
+ return DISPATCH_FINISHED;
+
+ // not finished (either not finished or last chunk)
+ *plast = (int32_t)(finished == LAST_CHUNK);
+ *plower = myLb;
+ *pupper = myUb;
+ *pstride = 1;
+
+ PRINT(LD_LOOP,
+ "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld\n",
+ GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
+ GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper),
+ P64(*pstride));
+ return DISPATCH_NOTFINISHED;
+ }
+
+ INLINE static void dispatch_fini() {
+ // nothing
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+  // end of template class that encapsulates all the helper functions
+ ////////////////////////////////////////////////////////////////////////////////
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (dyn loops)
+////////////////////////////////////////////////////////////////////////////////
+
+// init
+EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t tid,
+ int32_t schedule, int32_t lb, int32_t ub,
+ int32_t st, int32_t chunk) {
+ PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
+ omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
+ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+}
+
+EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid,
+ int32_t schedule, uint32_t lb, uint32_t ub,
+ int32_t st, int32_t chunk) {
+ PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
+ omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
+ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+}
+
+EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid,
+ int32_t schedule, int64_t lb, int64_t ub,
+ int64_t st, int64_t chunk) {
+ PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
+ omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
+ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+}
+
+EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid,
+ int32_t schedule, uint64_t lb, uint64_t ub,
+ int64_t st, int64_t chunk) {
+ PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
+ omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
+ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+}
+
+// next
+EXTERN int __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t tid, int32_t *p_last,
+ int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
+ PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
+ return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
+ p_last, p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_4u(kmp_Indent *loc, int32_t tid,
+ int32_t *p_last, uint32_t *p_lb,
+ uint32_t *p_ub, int32_t *p_st) {
+ PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
+ return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
+ p_last, p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_8(kmp_Indent *loc, int32_t tid, int32_t *p_last,
+ int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
+ PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
+ return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
+ p_last, p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_8u(kmp_Indent *loc, int32_t tid,
+ int32_t *p_last, uint64_t *p_lb,
+ uint64_t *p_ub, int64_t *p_st) {
+ PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
+ return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
+ p_last, p_lb, p_ub, p_st);
+}
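+
+// Illustrative caller sketch (hypothetical; `N` and `body` are placeholders,
+// not symbols from this runtime): a compiler lowering
+// `#pragma omp for schedule(dynamic, 10)` would typically drive the dispatch
+// entry points like this:
+//
+//   int32_t last, lb = 0, ub = N - 1, st = 1;
+//   __kmpc_dispatch_init_4(loc, tid, kmp_sched_dynamic, 0, N - 1, 1, 10);
+//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st) ==
+//          DISPATCH_NOTFINISHED)
+//     for (int32_t i = lb; i <= ub; ++i)
+//       body(i); // bounds are inclusive (Clang's i <= ub convention)
+//   __kmpc_dispatch_fini_4(loc, tid);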
+
+// fini
+EXTERN void __kmpc_dispatch_fini_4(kmp_Indent *loc, int32_t tid) {
+ PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
+ omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_4u(kmp_Indent *loc, int32_t tid) {
+ PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
+ omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_8(kmp_Indent *loc, int32_t tid) {
+ PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
+ omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_8u(kmp_Indent *loc, int32_t tid) {
+ PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
+ omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (static loops)
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid,
+ int32_t schedtype, int32_t *plastiter,
+ int32_t *plower, int32_t *pupper,
+ int32_t *pstride, int32_t incr,
+ int32_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
+ omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+ schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(),
+ isRuntimeUninitialized());
+}
+
+EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid,
+ int32_t schedtype, int32_t *plastiter,
+ uint32_t *plower, uint32_t *pupper,
+ int32_t *pstride, int32_t incr,
+ int32_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
+ omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+ schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(),
+ isRuntimeUninitialized());
+}
+
+EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid,
+ int32_t schedtype, int32_t *plastiter,
+ int64_t *plower, int64_t *pupper,
+ int64_t *pstride, int64_t incr,
+ int64_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
+ omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+ schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(),
+ isRuntimeUninitialized());
+}
+
+EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid,
+ int32_t schedtype, int32_t *plastiter,
+ uint64_t *plower, uint64_t *pupper,
+ int64_t *pstride, int64_t incr,
+ int64_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
+ omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+ schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(),
+ isRuntimeUninitialized());
+}
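+
+// Illustrative caller sketch (hypothetical; `N`, `body`, `loc` and `tid` are
+// placeholders): for `#pragma omp for schedule(static, 10)` a compiler would
+// typically call the init entry point once, then stride across the chunks it
+// was assigned:
+//
+//   int32_t last = 0, lb = 0, ub = N - 1, stride = 1;
+//   __kmpc_for_static_init_4(loc, tid, kmp_sched_static_chunk, &last, &lb,
+//                            &ub, &stride, /*incr=*/1, /*chunk=*/10);
+//   for (; lb <= N - 1; lb += stride, ub += stride)
+//     for (int32_t i = lb; i <= (ub < N - 1 ? ub : N - 1); ++i)
+//       body(i);
+//   __kmpc_for_static_fini(loc, tid);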
+
+EXTERN
+void __kmpc_for_static_init_4_simple_spmd(kmp_Indent *loc, int32_t global_tid,
+ int32_t schedtype, int32_t *plastiter,
+ int32_t *plower, int32_t *pupper,
+ int32_t *pstride, int32_t incr,
+ int32_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
+ omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+ schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/true,
+ /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_4u_simple_spmd(kmp_Indent *loc, int32_t global_tid,
+ int32_t schedtype,
+ int32_t *plastiter, uint32_t *plower,
+ uint32_t *pupper, int32_t *pstride,
+ int32_t incr, int32_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
+ omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+ schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/true,
+ /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_8_simple_spmd(kmp_Indent *loc, int32_t global_tid,
+ int32_t schedtype, int32_t *plastiter,
+ int64_t *plower, int64_t *pupper,
+ int64_t *pstride, int64_t incr,
+ int64_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
+ omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+ schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/true,
+ /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_8u_simple_spmd(kmp_Indent *loc, int32_t global_tid,
+ int32_t schedtype,
+ int32_t *plastiter, uint64_t *plower,
+ uint64_t *pupper, int64_t *pstride,
+ int64_t incr, int64_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
+ omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+ schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/true,
+ /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_4_simple_generic(
+ kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+ int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr,
+ int32_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
+ omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+ schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/false,
+ /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_4u_simple_generic(
+ kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+ uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
+ int32_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
+ omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+ schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/false,
+ /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_8_simple_generic(
+ kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+ int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr,
+ int64_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
+ omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+ schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/false,
+ /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_8u_simple_generic(
+ kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+ uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
+ int64_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
+ omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+ schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/false,
+ /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid) {
+ PRINT0(LD_IO, "call kmpc_for_static_fini\n");
+}
+
+namespace {
+INLINE void syncWorkersInGenericMode(uint32_t NumThreads) {
+ int NumWarps = ((NumThreads + WARPSIZE - 1) / WARPSIZE);
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+ // On Volta and newer architectures we require that all lanes in
+ // a warp (at least, all present for the kernel launch) participate in the
+ // barrier. This is enforced when launching the parallel region. An
+ // exception is when there are < WARPSIZE workers. In this case only 1 worker
+ // is started, so we don't need a barrier.
+ if (NumThreads > 1) {
+#endif
+ named_sync(L1_BARRIER, WARPSIZE * NumWarps);
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+ }
+#endif
+}
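+
+// For example (illustrative arithmetic): NumThreads = 33 gives
+// NumWarps = (33 + 31) / 32 = 2, so the named barrier is sized for
+// WARPSIZE * 2 = 64 slots and both the full and the partially populated
+// worker warps synchronize on it.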
+} // namespace
+
+EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Indent *loc, int32_t gtid,
+ int32_t varNum, void *array) {
+ PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n");
+
+ omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
+ int tid = GetOmpThreadId(GetLogicalThreadIdInBlock(), isSPMDMode(),
+ isRuntimeUninitialized());
+ uint32_t NumThreads = GetNumberOfOmpThreads(
+ GetLogicalThreadIdInBlock(), isSPMDMode(), isRuntimeUninitialized());
+ uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
+ for (unsigned i = 0; i < varNum; i++) {
+ // Reset buffer.
+ if (tid == 0)
+ *Buffer = 0; // Reset to minimum loop iteration value.
+
+ // Barrier.
+ syncWorkersInGenericMode(NumThreads);
+
+ // Atomic max of iterations.
+ uint64_t *varArray = (uint64_t *)array;
+ uint64_t elem = varArray[i];
+ (void)atomicMax((unsigned long long int *)Buffer,
+ (unsigned long long int)elem);
+
+ // Barrier.
+ syncWorkersInGenericMode(NumThreads);
+
+ // Read max value and update thread private array.
+ varArray[i] = *Buffer;
+
+ // Barrier.
+ syncWorkersInGenericMode(NumThreads);
+ }
+}
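+
+// Illustrative trace (hypothetical values): suppose 3 threads hold candidate
+// iterations {5, 42, 17} in varArray[i] for one conditional lastprivate
+// variable. The atomicMax rounds leave *Buffer == 42, and after the final
+// barrier every thread reads back varArray[i] == 42, so only the thread that
+// actually executed iteration 42 will go on to treat its private copy as the
+// lastprivate value.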