1 files changed, 226 insertions, 0 deletions
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
new file mode 100644
index 0000000..e4efa18
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
@@ -0,0 +1,226 @@
+//===---- omptarget-nvptxi.h - NVPTX OpenMP GPU initialization --- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of all library macros, types,
+// and functions.
+//
+//===----------------------------------------------------------------------===//
+
+////////////////////////////////////////////////////////////////////////////////
+// Task Descriptor
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const {
+  // sched starts from 1..4; encode it as 0..3; so add 1 here
+  uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1;
+  return (omp_sched_t)rc;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
+  // sched starts from 1..4; encode it as 0..3; so sub 1 here
+  uint8_t val = ((uint8_t)sched) - 1;
+  // clear current sched
+  items.flags &= ~TaskDescr_SchedMask;
+  // set new sched
+  items.flags |= val;
+}
+
+INLINE void
+omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
+  // slow method
+  // flag:
+  //   default sched is static,
+  //   dyn is off (unused now anyway, but may need to sample from host ?)
+  //   not in parallel
+
+  items.flags = 0;
+  items.threadId = 0;         // is master
+  items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
+}
+
+// This is called when all threads are started together in SPMD mode.
+// OMP directives include target parallel, target distribute parallel for, etc.
+INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
+    omptarget_nvptx_TaskDescr *parentTaskDescr) {
+  // slow method
+  // flag:
+  //   default sched is static,
+  //   dyn is off (unused now anyway, but may need to sample from host ?)
+  //   in L1 parallel
+
+  items.flags =
+      TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
+  items.threadId =
+      GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
+  items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
+  prev = parentTaskDescr;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyData(
+    omptarget_nvptx_TaskDescr *sourceTaskDescr) {
+  items = sourceTaskDescr->items;
+}
+
+INLINE void
+omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
+  CopyData(sourceTaskDescr);
+  prev = sourceTaskDescr->prev;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyParent(
+    omptarget_nvptx_TaskDescr *parentTaskDescr) {
+  CopyData(parentTaskDescr);
+  prev = parentTaskDescr;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
+    omptarget_nvptx_TaskDescr *parentTaskDescr) {
+  CopyParent(parentTaskDescr);
+  items.flags = items.flags & ~TaskDescr_IsParConstr;
+  ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
+    omptarget_nvptx_TaskDescr *masterTaskDescr) {
+  CopyParent(masterTaskDescr);
+  // overrwrite specific items;
+  items.flags |=
+      TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
+    omptarget_nvptx_TaskDescr *workTaskDescr) {
+  Copy(workTaskDescr);
+  //
+  // overrwrite specific items;
+  //
+  // The threadID should be GetThreadIdInBlock() % GetMasterThreadID().
+  // This is so that the serial master (first lane in the master warp)
+  // gets a threadId of 0.
+  // However, we know that this function is always called in a parallel
+  // region where only workers are active.  The serial master thread
+  // never enters this region.  When a parallel region is executed serially,
+  // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions
+  // are called, which never activate this region.
+  items.threadId =
+      GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
+    omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
+  CopyParent(parentTaskDescr);
+  items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
+  items.threadId = tid;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() {
+  loopData.loopUpperBound =
+      omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId);
+  loopData.nextLowerBound =
+      omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId);
+  loopData.schedule =
+      omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId);
+  loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId);
+  loopData.stride =
+      omptarget_nvptx_threadPrivateContext->Stride(items.threadId);
+}
+
+INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
+  omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk;
+  omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) =
+      loopData.loopUpperBound;
+  omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) =
+      loopData.nextLowerBound;
+  omptarget_nvptx_threadPrivateContext->Stride(items.threadId) =
+      loopData.stride;
+  omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) =
+      loopData.schedule;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Thread Private Context
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE omptarget_nvptx_TaskDescr *
+omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const {
+  ASSERT0(
+      LT_FUSSY, tid < MAX_THREADS_PER_TEAM,
+      "Getting top level, tid is larger than allocated data structure size");
+  return topTaskDescr[tid];
+}
+
+INLINE void
+omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
+  // levelOneTaskDescr is init when starting the parallel region
+  // top task descr is NULL (team master version will be fixed separately)
+  topTaskDescr[tid] = NULL;
+  // no num threads value has been pushed
+  nextRegion.tnum[tid] = 0;
+  // the following don't need to be init here; they are init when using dyn
+  // sched
+  // current_Event, events_Number, chunk, num_Iterations, schedule
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Team Descriptor
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
+  levelZeroTaskDescr.InitLevelZeroTaskDescr();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Get private data structure for thread
+////////////////////////////////////////////////////////////////////////////////
+
+// Utility routines for CUDA threads
+INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() {
+  return omptarget_nvptx_threadPrivateContext->TeamContext();
+}
+
+INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() {
+  omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
+  return currTeamDescr.WorkDescr();
+}
+
+INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
+  return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+}
+
+INLINE omptarget_nvptx_TaskDescr *
+getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
+  return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Memory management runtime functions.
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
+  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
+          "SlotIdx is too big or uninitialized.");
+  ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
+          "MemIdx is too big or uninitialized.");
+  MemDataTy &MD = MemData[usedSlotIdx];
+  atomicExch((unsigned *)&MD.keys[usedMemIdx], 0);
+}
+
+INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
+                                                                size_t size) {
+  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
+          "SlotIdx is too big or uninitialized.");
+  const unsigned sm = usedSlotIdx;
+  MemDataTy &MD = MemData[sm];
+  unsigned i = hash(GetBlockIdInKernel());
+  while (atomicCAS((unsigned *)&MD.keys[i], 0, 1) != 0) {
+    i = hash(i + 1);
+  }
+  usedSlotIdx = sm;
+  usedMemIdx = i;
+  return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
+}