path: root/rc3/libomptarget/deviceRTLs/nvptx/src/loop.cu
Diffstat (limited to 'rc3/libomptarget/deviceRTLs/nvptx/src/loop.cu')
-rw-r--r--    rc3/libomptarget/deviceRTLs/nvptx/src/loop.cu    806
1 files changed, 806 insertions, 0 deletions
diff --git a/rc3/libomptarget/deviceRTLs/nvptx/src/loop.cu b/rc3/libomptarget/deviceRTLs/nvptx/src/loop.cu
new file mode 100644
index 0000000..998ce54
--- /dev/null
+++ b/rc3/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -0,0 +1,806 @@
+//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the KMPC interface
+// for the loop construct plus other worksharing constructs that use the same
+// interface as loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+// template class that encapsulates all the helper functions
+//
+// T is loop iteration type (32 | 64) (unsigned | signed)
+// ST is the signed version of T
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+template <typename T, typename ST> class omptarget_nvptx_LoopSupport {
+public:
+ ////////////////////////////////////////////////////////////////////////////////
+ // Loop with static scheduling with chunk
+
+ // Generic implementation of OMP loop scheduling with static policy
+ /*! \brief Calculate initial bounds for static loop and stride
+ * @param[in] loc location in code of the call (not used here)
+ * @param[in] global_tid global thread id
+   * @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
+ * @param[in] plastiter pointer to last iteration
+   * @param[in,out] plower pointer to loop lower bound. It will contain the
+   * value of the lower bound of the first chunk
+   * @param[in,out] pupper pointer to loop upper bound. It will contain the
+   * value of the upper bound of the first chunk
+   * @param[in,out] pstride pointer to loop stride. It will contain the value
+   * of the stride between two successive chunks executed by the same thread
+   * @param[in] incr loop increment bump
+   * @param[in] chunk chunk size
+ */
+
+ // helper function for static chunk
+ INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride,
+ ST chunk, T entityId, T numberOfEntities) {
+    // Each thread executes multiple chunks, all of the same size except
+    // possibly the last one.
+
+ // distance between two successive chunks
+ stride = numberOfEntities * chunk;
+ lb = lb + entityId * chunk;
+ T inputUb = ub;
+ ub = lb + chunk - 1; // Clang uses i <= ub
+    // Say ub' is the beginning of the last chunk. Then whoever has a
+    // lower bound plus a multiple of the stride equal to ub' is the
+    // last one.
+    T beginningLastChunk = inputUb - (inputUb % chunk);
+    last = ((beginningLastChunk - lb) % stride) == 0;
+ }
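+
+  // Worked example (hypothetical values, for illustration only): with lb = 0,
+  // ub = 99, chunk = 10 and numberOfEntities = 4, entity 0 starts at lb = 0
+  // with stride 40, so its successive chunks are [0, 9], [40, 49] and
+  // [80, 89]; entity 1 gets [10, 19], [50, 59] and [90, 99], and since 90 is
+  // the beginning of the last chunk and (90 - 10) % 40 == 0, entity 1 is the
+  // one that reports last == 1.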
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Loop with static scheduling without chunk
+
+ // helper function for static no chunk
+ INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride,
+ ST &chunk, T entityId,
+ T numberOfEntities) {
+    // No chunk size specified. Each thread or warp gets at most one
+    // chunk; chunks are all of nearly equal size.
+ T loopSize = ub - lb + 1;
+
+ chunk = loopSize / numberOfEntities;
+ T leftOver = loopSize - chunk * numberOfEntities;
+
+ if (entityId < leftOver) {
+ chunk++;
+ lb = lb + entityId * chunk;
+ } else {
+ lb = lb + entityId * chunk + leftOver;
+ }
+
+ T inputUb = ub;
+ ub = lb + chunk - 1; // Clang uses i <= ub
+ last = lb <= inputUb && inputUb <= ub;
+ stride = loopSize; // make sure we only do 1 chunk per warp
+ }
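+
+  // Worked example (hypothetical values, for illustration only): with lb = 0,
+  // ub = 9 (loopSize = 10) and numberOfEntities = 4, chunk = 2 and
+  // leftOver = 2, so entities 0 and 1 receive three iterations each
+  // ([0, 2] and [3, 5]) while entities 2 and 3 receive two each ([6, 7] and
+  // [8, 9]); only entity 3, whose chunk contains the original ub, reports
+  // last == 1.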
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Support for Static Init
+
+ INLINE static void for_static_init(int32_t gtid, int32_t schedtype,
+ int32_t *plastiter, T *plower, T *pupper,
+ ST *pstride, ST chunk,
+ bool IsSPMDExecutionMode,
+ bool IsRuntimeUninitialized) {
+ // When IsRuntimeUninitialized is true, we assume that the caller is
+ // in an L0 parallel region and that all worker threads participate.
+
+ int tid = GetLogicalThreadIdInBlock(IsSPMDExecutionMode);
+
+ // Assume we are in teams region or that we use a single block
+ // per target region
+ ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(
+ tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
+
+    // All warps that are in excess of the maximum requested do
+    // not execute the loop.
+ PRINT(LD_LOOP,
+ "OMP Thread %d: schedule type %d, chunk size = %lld, mytid "
+ "%d, num tids %d\n",
+ (int)gtid, (int)schedtype, (long long)chunk, (int)gtid,
+ (int)numberOfActiveOMPThreads);
+ ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
+ "current thread is not needed here; error");
+
+ // copy
+ int lastiter = 0;
+ T lb = *plower;
+ T ub = *pupper;
+ ST stride = *pstride;
+ // init
+ switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
+ case kmp_sched_static_chunk: {
+ if (chunk > 0) {
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+ numberOfActiveOMPThreads);
+ break;
+ }
+ } // note: if chunk <=0, use nochunk
+ case kmp_sched_static_balanced_chunk: {
+ if (chunk > 0) {
+ // round up to make sure the chunk is enough to cover all iterations
+ T tripCount = ub - lb + 1; // +1 because ub is inclusive
+ T span = (tripCount + numberOfActiveOMPThreads - 1) /
+ numberOfActiveOMPThreads;
+ // perform chunk adjustment
+ chunk = (span + chunk - 1) & ~(chunk - 1);
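+        // Worked example (hypothetical values, for illustration only; the
+        // mask-based rounding above assumes the incoming chunk is a power of
+        // two): with tripCount = 100, numberOfActiveOMPThreads = 8 and
+        // chunk = 4, span = 13 and the adjusted chunk becomes 16, so each
+        // thread covers its share of the iteration space in a single chunk.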
+
+ ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
+ T oldUb = ub;
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+ numberOfActiveOMPThreads);
+ if (ub > oldUb)
+ ub = oldUb;
+ break;
+ }
+ } // note: if chunk <=0, use nochunk
+ case kmp_sched_static_nochunk: {
+ ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
+ numberOfActiveOMPThreads);
+ break;
+ }
+ case kmp_sched_distr_static_chunk: {
+ if (chunk > 0) {
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
+ GetNumberOfOmpTeams());
+ break;
+ } // note: if chunk <=0, use nochunk
+ }
+ case kmp_sched_distr_static_nochunk: {
+ ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
+ GetNumberOfOmpTeams());
+ break;
+ }
+ case kmp_sched_distr_static_chunk_sched_static_chunkone: {
+ ForStaticChunk(lastiter, lb, ub, stride, chunk,
+ numberOfActiveOMPThreads * GetOmpTeamId() + gtid,
+ GetNumberOfOmpTeams() * numberOfActiveOMPThreads);
+ break;
+ }
+ default: {
+ ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", (int)schedtype);
+ PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n",
+ (int)schedtype);
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+ numberOfActiveOMPThreads);
+ break;
+ }
+ }
+ // copy back
+ *plastiter = lastiter;
+ *plower = lb;
+ *pupper = ub;
+ *pstride = stride;
+ PRINT(LD_LOOP,
+ "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last "
+ "%d\n",
+ (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(),
+ (long long)(*plower), (long long)(*pupper), (long long)(*pstride),
+ (int)lastiter);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Support for dispatch Init
+
+ INLINE static int OrderedSchedule(kmp_sched_t schedule) {
+ return schedule >= kmp_sched_ordered_first &&
+ schedule <= kmp_sched_ordered_last;
+ }
+
+ INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId,
+ kmp_sched_t schedule, T lb, T ub, ST st,
+ ST chunk) {
+ ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
+ "Expected non-SPMD mode + initialized runtime.");
+ int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
+ omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
+ T tnum = currTaskDescr->ThreadsInTeam();
+ T tripCount = ub - lb + 1; // +1 because ub is inclusive
+ ASSERT0(LT_FUSSY, threadId < tnum,
+ "current thread is not needed here; error");
+
+    /* Currently just ignore the monotonic and non-monotonic modifiers
+     * (the compiler isn't producing them yet anyway).
+     * When it is, we'll want to look at them somewhere here and use that
+     * information to add to our schedule choice. We shouldn't need to pass
+     * them on; they merely affect which schedule we can legally choose for
+     * various dynamic cases (in particular, whether or not a stealing scheme
+     * is legal).
+     */
+ schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
+
+ // Process schedule.
+ if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
+ if (OrderedSchedule(schedule))
+ __kmpc_barrier(loc, threadId);
+ PRINT(LD_LOOP,
+ "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
+ (long)tnum, (long long)tripCount, (int)schedule);
+ schedule = kmp_sched_static_chunk;
+ chunk = tripCount; // one thread gets the whole loop
+ } else if (schedule == kmp_sched_runtime) {
+ // process runtime
+ omp_sched_t rtSched = currTaskDescr->GetRuntimeSched();
+ chunk = currTaskDescr->RuntimeChunkSize();
+ switch (rtSched) {
+ case omp_sched_static: {
+ if (chunk > 0)
+ schedule = kmp_sched_static_chunk;
+ else
+ schedule = kmp_sched_static_nochunk;
+ break;
+ }
+ case omp_sched_auto: {
+ schedule = kmp_sched_static_chunk;
+ chunk = 1;
+ break;
+ }
+ case omp_sched_dynamic:
+ case omp_sched_guided: {
+ schedule = kmp_sched_dynamic;
+ break;
+ }
+ }
+ PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule,
+ (long long)chunk);
+ } else if (schedule == kmp_sched_auto) {
+ schedule = kmp_sched_static_chunk;
+ chunk = 1;
+ PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule,
+ (long long)chunk);
+ } else {
+ PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule,
+ (long long)chunk);
+ ASSERT(LT_FUSSY,
+ schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
+ "unknown schedule %d & chunk %lld\n", (int)schedule,
+ (long long)chunk);
+ }
+
+ // init schedules
+ if (schedule == kmp_sched_static_chunk) {
+ ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+ // save sched state
+ omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+ // save ub
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+ // compute static chunk
+ ST stride;
+ int lastiter = 0;
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+ // save computed params
+ omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+ omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
+ PRINT(LD_LOOP,
+ "dispatch init (static chunk) : num threads = %d, ub = %" PRId64
+ ", next lower bound = %llu, stride = %llu\n",
+ (int)tnum,
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+ (unsigned long long)
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+ (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
+ tid));
+ } else if (schedule == kmp_sched_static_balanced_chunk) {
+ ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+ // save sched state
+ omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+ // save ub
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+ // compute static chunk
+ ST stride;
+ int lastiter = 0;
+ // round up to make sure the chunk is enough to cover all iterations
+ T span = (tripCount + tnum - 1) / tnum;
+ // perform chunk adjustment
+ chunk = (span + chunk - 1) & ~(chunk - 1);
+
+ T oldUb = ub;
+ ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+ ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
+ if (ub > oldUb)
+ ub = oldUb;
+ // save computed params
+ omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+ omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
+ PRINT(LD_LOOP,
+ "dispatch init (static chunk) : num threads = %d, ub = %" PRId64
+ ", next lower bound = %llu, stride = %llu\n",
+ (int)tnum,
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+ (unsigned long long)
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+ (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
+ tid));
+ } else if (schedule == kmp_sched_static_nochunk) {
+ ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
+ // save sched state
+ omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+ // save ub
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+ // compute static chunk
+ ST stride;
+ int lastiter = 0;
+ ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+ // save computed params
+ omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+ omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
+ PRINT(LD_LOOP,
+ "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
+ ", next lower bound = %llu, stride = %llu\n",
+ (int)tnum,
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+ (unsigned long long)
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+ (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
+ tid));
+ } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
+ // save data
+ omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+ if (chunk < 1)
+ chunk = 1;
+ omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+ __kmpc_barrier(loc, threadId);
+ if (tid == 0) {
+ omptarget_nvptx_threadPrivateContext->Cnt() = 0;
+ __threadfence_block();
+ }
+ __kmpc_barrier(loc, threadId);
+ PRINT(LD_LOOP,
+ "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
+ ", chunk %" PRIu64 "\n",
+ (int)tnum,
+ (unsigned long long)
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+ omptarget_nvptx_threadPrivateContext->Chunk(tid));
+ }
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Support for dispatch next
+
+ INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
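+    // Broadcast the 64-bit value from the leader lane by splitting it into
+    // two 32-bit halves, shuffling each half, and reassembling the result.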
+ int lo, hi;
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
+ hi = __SHFL_SYNC(active, hi, leader);
+ lo = __SHFL_SYNC(active, lo, leader);
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
+ return val;
+ }
+
+ INLINE static uint64_t NextIter() {
+ unsigned int active = __ACTIVEMASK();
+ int leader = __ffs(active) - 1;
+ int change = __popc(active);
+ unsigned lane_mask_lt;
+ asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt));
+ unsigned int rank = __popc(active & lane_mask_lt);
+ uint64_t warp_res;
+ if (rank == 0) {
+ warp_res = atomicAdd(
+ (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
+ change);
+ }
+ warp_res = Shuffle(active, warp_res, leader);
+ return warp_res + rank;
+ }
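+
+  // Worked example (hypothetical values, for illustration only): if 5 lanes
+  // of a warp are active and Cnt() currently holds 12, the leader lane issues
+  // a single atomicAdd of 5 and obtains 12; after the broadcast, the active
+  // lanes (rank 0..4) return 12, 13, 14, 15 and 16, i.e. one global iteration
+  // index each at the cost of a single atomic operation per warp.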
+
+ INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
+ T loopLowerBound, T loopUpperBound) {
+ T N = NextIter();
+ lb = loopLowerBound + N * chunkSize;
+ ub = lb + chunkSize - 1; // Clang uses i <= ub
+
+    // 3 result cases:
+    // a. lb <= loopUpperBound and ub < loopUpperBound --> NOT_FINISHED
+    // b. lb <= loopUpperBound and ub >= loopUpperBound: last chunk -->
+    // LAST_CHUNK
+    // c. lb > loopUpperBound: empty chunk --> FINISHED
+ // a.
+ if (lb <= loopUpperBound && ub < loopUpperBound) {
+ PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n",
+ (long long)lb, (long long)ub, (long long)loopUpperBound);
+ return NOT_FINISHED;
+ }
+ // b.
+ if (lb <= loopUpperBound) {
+ PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n",
+ (long long)lb, (long long)ub, (long long)loopUpperBound);
+ ub = loopUpperBound;
+ return LAST_CHUNK;
+ }
+ // c. if we are here, we are in case 'c'
+ lb = loopUpperBound + 2;
+ ub = loopUpperBound + 1;
+ PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb,
+ (long long)ub, (long long)loopUpperBound);
+ return FINISHED;
+ }
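+
+  // Worked example (hypothetical values, for illustration only): with
+  // loopLowerBound = 0, loopUpperBound = 99 and chunkSize = 10, N = 3 yields
+  // [30, 39] and NOT_FINISHED; N = 9 yields [90, 99], which is clipped to the
+  // loop upper bound and reported as LAST_CHUNK; N = 10 yields lb = 100 > 99,
+  // so the chunk is empty and FINISHED is returned.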
+
+ INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast,
+ T *plower, T *pupper, ST *pstride) {
+ ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
+ "Expected non-SPMD mode + initialized runtime.");
+    // Get the logical thread ID in the block; this automatically selects a
+    // thread or warp ID based on the selected implementation.
+    int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
+ ASSERT0(LT_FUSSY,
+ gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
+ checkRuntimeUninitialized(loc)),
+ "current thread is not needed here; error");
+ // retrieve schedule
+ kmp_sched_t schedule =
+ omptarget_nvptx_threadPrivateContext->ScheduleType(tid);
+
+ // xxx reduce to one
+ if (schedule == kmp_sched_static_chunk ||
+ schedule == kmp_sched_static_nochunk) {
+ T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid);
+ T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid);
+ // finished?
+ if (myLb > ub) {
+ PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n",
+ (long long)myLb, (long long)ub);
+ return DISPATCH_FINISHED;
+ }
+ // not finished, save current bounds
+ ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid);
+ *plower = myLb;
+ T myUb = myLb + chunk - 1; // Clang uses i <= ub
+ if (myUb > ub)
+ myUb = ub;
+ *pupper = myUb;
+ *plast = (int32_t)(myUb == ub);
+
+ // increment next lower bound by the stride
+ ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid);
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride;
+ PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n",
+ (long long)*plower, (long long)*pupper);
+ return DISPATCH_NOTFINISHED;
+ }
+ ASSERT0(LT_FUSSY,
+ schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
+ "bad sched");
+ T myLb, myUb;
+ int finished = DynamicNextChunk(
+ myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid),
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
+
+ if (finished == FINISHED)
+ return DISPATCH_FINISHED;
+
+    // not finished yet: either a regular chunk or the last chunk
+ *plast = (int32_t)(finished == LAST_CHUNK);
+ *plower = myLb;
+ *pupper = myUb;
+ *pstride = 1;
+
+ PRINT(
+ LD_LOOP,
+ "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
+ "last %d\n",
+ (int)GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
+ (int)GetNumberOfWorkersInTeam(), (long long)*plower, (long long)*pupper,
+ (long long)*pstride, (int)*plast);
+ return DISPATCH_NOTFINISHED;
+ }
+
+ INLINE static void dispatch_fini() {
+ // nothing
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+  // end of template class that encapsulates all the helper functions
+ ////////////////////////////////////////////////////////////////////////////////
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (dyn loops)
+////////////////////////////////////////////////////////////////////////////////
+
+// init
+EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid,
+ int32_t schedule, int32_t lb, int32_t ub,
+ int32_t st, int32_t chunk) {
+ PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
+ omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
+ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+}
+
+EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid,
+ int32_t schedule, uint32_t lb, uint32_t ub,
+ int32_t st, int32_t chunk) {
+ PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
+ omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
+ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+}
+
+EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid,
+ int32_t schedule, int64_t lb, int64_t ub,
+ int64_t st, int64_t chunk) {
+ PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
+ omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
+ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+}
+
+EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid,
+ int32_t schedule, uint64_t lb, uint64_t ub,
+ int64_t st, int64_t chunk) {
+ PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
+ omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
+ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+}
+
+// next
+EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last,
+ int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
+ PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
+ return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
+ loc, tid, p_last, p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid,
+ int32_t *p_last, uint32_t *p_lb,
+ uint32_t *p_ub, int32_t *p_st) {
+ PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
+ return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
+ loc, tid, p_last, p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last,
+ int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
+ PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
+ return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
+ loc, tid, p_last, p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid,
+ int32_t *p_last, uint64_t *p_lb,
+ uint64_t *p_ub, int64_t *p_st) {
+ PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
+ return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
+ loc, tid, p_last, p_lb, p_ub, p_st);
+}
+
+// fini
+EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) {
+ PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
+ omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) {
+ PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
+ omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) {
+ PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
+ omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) {
+ PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
+ omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
+}
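+
+// A rough sketch of how a compiler-generated dynamically scheduled loop is
+// expected to drive the entry points above (illustrative only; "loc", "tid",
+// "N" and "body" are placeholders, not names defined in this file, and the
+// sketch assumes __kmpc_dispatch_next_* returns nonzero while chunks remain):
+//
+//   __kmpc_dispatch_init_4(loc, tid, kmp_sched_dynamic, /*lb=*/0,
+//                          /*ub=*/N - 1, /*st=*/1, /*chunk=*/1);
+//   int32_t last, lb, ub, st;
+//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st)) {
+//     for (int32_t i = lb; i <= ub; ++i) // Clang uses inclusive upper bounds
+//       body(i);
+//   }
+//   __kmpc_dispatch_fini_4(loc, tid);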
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (static loops)
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
+ int32_t schedtype, int32_t *plastiter,
+ int32_t *plower, int32_t *pupper,
+ int32_t *pstride, int32_t incr,
+ int32_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
+ omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ checkSPMDMode(loc), checkRuntimeUninitialized(loc));
+}
+
+EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
+ int32_t schedtype, int32_t *plastiter,
+ uint32_t *plower, uint32_t *pupper,
+ int32_t *pstride, int32_t incr,
+ int32_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
+ omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ checkSPMDMode(loc), checkRuntimeUninitialized(loc));
+}
+
+EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
+ int32_t schedtype, int32_t *plastiter,
+ int64_t *plower, int64_t *pupper,
+ int64_t *pstride, int64_t incr,
+ int64_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
+ omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ checkSPMDMode(loc), checkRuntimeUninitialized(loc));
+}
+
+EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
+ int32_t schedtype, int32_t *plastiter,
+ uint64_t *plower, uint64_t *pupper,
+ int64_t *pstride, int64_t incr,
+ int64_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
+ omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ checkSPMDMode(loc), checkRuntimeUninitialized(loc));
+}
+
+EXTERN
+void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
+ int32_t schedtype, int32_t *plastiter,
+ int32_t *plower, int32_t *pupper,
+ int32_t *pstride, int32_t incr,
+ int32_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
+ omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
+ int32_t schedtype,
+ int32_t *plastiter, uint32_t *plower,
+ uint32_t *pupper, int32_t *pstride,
+ int32_t incr, int32_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
+ omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
+ int32_t schedtype, int32_t *plastiter,
+ int64_t *plower, int64_t *pupper,
+ int64_t *pstride, int64_t incr,
+ int64_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
+ omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
+ int32_t schedtype,
+ int32_t *plastiter, uint64_t *plower,
+ uint64_t *pupper, int64_t *pstride,
+ int64_t incr, int64_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
+ omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_4_simple_generic(
+ kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+ int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr,
+ int32_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
+ omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_4u_simple_generic(
+ kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+ uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
+ int32_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
+ omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_8_simple_generic(
+ kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+ int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr,
+ int64_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
+ omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_8u_simple_generic(
+ kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+ uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
+ int64_t chunk) {
+ PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
+ omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+ global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+ /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
+ PRINT0(LD_IO, "call kmpc_for_static_fini\n");
+}
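+
+// A rough sketch of the matching compiler-generated pattern for a statically
+// scheduled loop (illustrative only; "loc", "tid", "N", "CHUNK" and "body"
+// are placeholders, not names defined in this file). The init call rewrites
+// the bounds and stride in place; the thread then walks its own chunks:
+//
+//   int32_t last = 0, lb = 0, ub = N - 1, st = 1;
+//   const int32_t loopUb = ub;
+//   __kmpc_for_static_init_4(loc, tid, kmp_sched_static_chunk, &last, &lb,
+//                            &ub, &st, /*incr=*/1, /*chunk=*/CHUNK);
+//   for (; lb <= loopUb; lb += st, ub += st)        // this thread's chunks
+//     for (int32_t i = lb; i <= (ub < loopUb ? ub : loopUb); ++i)
+//       body(i);
+//   __kmpc_for_static_fini(loc, tid);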
+
+namespace {
+INLINE void syncWorkersInGenericMode(uint32_t NumThreads) {
+ int NumWarps = ((NumThreads + WARPSIZE - 1) / WARPSIZE);
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+ // On Volta and newer architectures we require that all lanes in
+ // a warp (at least, all present for the kernel launch) participate in the
+ // barrier. This is enforced when launching the parallel region. An
+ // exception is when there are < WARPSIZE workers. In this case only 1 worker
+ // is started, so we don't need a barrier.
+ if (NumThreads > 1) {
+#endif
+ named_sync(L1_BARRIER, WARPSIZE * NumWarps);
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+ }
+#endif
+}
+}; // namespace
+
+EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,
+ int32_t varNum, void *array) {
+ PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n");
+ ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
+ "Expected non-SPMD mode + initialized runtime.");
+
+ omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
+ int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
+ uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
+ checkRuntimeUninitialized(loc));
+ uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
+ for (unsigned i = 0; i < varNum; i++) {
+ // Reset buffer.
+ if (gtid == 0)
+ *Buffer = 0; // Reset to minimum loop iteration value.
+
+ // Barrier.
+ syncWorkersInGenericMode(NumThreads);
+
+ // Atomic max of iterations.
+ uint64_t *varArray = (uint64_t *)array;
+ uint64_t elem = varArray[i];
+ (void)atomicMax((unsigned long long int *)Buffer,
+ (unsigned long long int)elem);
+
+ // Barrier.
+ syncWorkersInGenericMode(NumThreads);
+
+ // Read max value and update thread private array.
+ varArray[i] = *Buffer;
+
+ // Barrier.
+ syncWorkersInGenericMode(NumThreads);
+ }
+}