Diffstat (limited to 'final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h')
-rw-r--r-- | final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h | 441 |
1 files changed, 441 insertions, 0 deletions
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
new file mode 100644
index 0000000..84c61f9
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -0,0 +1,441 @@
+//===---- omptarget-nvptx.h - NVPTX OpenMP GPU initialization ---- CUDA -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of all library macros, types,
+// and functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __OMPTARGET_NVPTX_H
+#define __OMPTARGET_NVPTX_H
+
+// std includes
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <inttypes.h>
+
+// cuda includes
+#include <cuda.h>
+#include <math.h>
+
+// local includes
+#include "counter_group.h"
+#include "debug.h"     // debug
+#include "interface.h" // interfaces with omp, compiler, and user
+#include "option.h"    // choices we have
+#include "state-queue.h"
+#include "support.h"
+
+#define OMPTARGET_NVPTX_VERSION 1.1
+
+// used by the library for the interface with the app
+#define DISPATCH_FINISHED 0
+#define DISPATCH_NOTFINISHED 1
+
+// used by dynamic scheduling
+#define FINISHED 0
+#define NOT_FINISHED 1
+#define LAST_CHUNK 2
+
+#define BARRIER_COUNTER 0
+#define ORDERED_COUNTER 1
+
+// Macros for Cuda intrinsics
+// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
+// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask().
+#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
+#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
+#define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
+  __shfl_down_sync((mask), (var), (delta), (width))
+#define __BALLOT_SYNC(mask, predicate) __ballot_sync((mask), (predicate))
+#define __ACTIVEMASK() __activemask()
+#else
+#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
+#define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
+  __shfl_down((var), (delta), (width))
+#define __BALLOT_SYNC(mask, predicate) __ballot((predicate))
+#define __ACTIVEMASK() __ballot(1)
+#endif
+
+// arguments needed for L0 parallelism only.
+class omptarget_nvptx_SharedArgs {
+public:
+  // All these methods must be called by the master thread only.
+  INLINE void Init() {
+    args = buffer;
+    nArgs = MAX_SHARED_ARGS;
+  }
+  INLINE void DeInit() {
+    // Free any memory allocated for outlined parallel function with a large
+    // number of arguments.
+    if (nArgs > MAX_SHARED_ARGS) {
+      SafeFree(args, (char *)"new extended args");
+      Init();
+    }
+  }
+  INLINE void EnsureSize(size_t size) {
+    if (size > nArgs) {
+      if (nArgs > MAX_SHARED_ARGS) {
+        SafeFree(args, (char *)"new extended args");
+      }
+      args = (void **) SafeMalloc(size * sizeof(void *),
+                                  (char *)"new extended args");
+      nArgs = size;
+    }
+  }
+  // Called by all threads.
+  INLINE void **GetArgs() { return args; };
+private:
+  // buffer of pre-allocated arguments.
+  void *buffer[MAX_SHARED_ARGS];
+  // pointer to arguments buffer.
+  // starts off as a pointer to 'buffer' but can be dynamically allocated.
+  void **args;
+  // starts off as MAX_SHARED_ARGS but can increase in size.
+  uint32_t nArgs;
+};
+
+extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+
+// Data sharing related quantities; these need to match what is used in the
+// compiler.
+enum DATA_SHARING_SIZES {
+  // The maximum number of workers in a kernel.
+  DS_Max_Worker_Threads = 992,
+  // The size reserved for data in a shared memory slot.
+  DS_Slot_Size = 256,
+  // The slot size that should be reserved for a working warp.
+  DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
+  // The maximum number of warps in use.
+  DS_Max_Warp_Number = 32,
+};
+
+// Data structure to keep in shared memory that traces the current slot, stack,
+// and frame pointer as well as the active threads that didn't exit the current
+// environment.
+struct DataSharingStateTy {
+  __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
+  void *StackPtr[DS_Max_Warp_Number];
+  void *FramePtr[DS_Max_Warp_Number];
+  int32_t ActiveThreads[DS_Max_Warp_Number];
+};
+// Additional worker slot type which is initialized with the default worker
+// slot size of WARPSIZE * DS_Slot_Size (32 * 256) bytes.
+struct __kmpc_data_sharing_worker_slot_static {
+  __kmpc_data_sharing_slot *Next;
+  __kmpc_data_sharing_slot *Prev;
+  void *PrevSlotStackPtr;
+  void *DataEnd;
+  char Data[DS_Worker_Warp_Slot_Size];
+};
+// Additional master slot type which is initialized with the default master
+// slot size of DS_Slot_Size (256) bytes.
+struct __kmpc_data_sharing_master_slot_static {
+  __kmpc_data_sharing_slot *Next;
+  __kmpc_data_sharing_slot *Prev;
+  void *PrevSlotStackPtr;
+  void *DataEnd;
+  char Data[DS_Slot_Size];
+};
+extern __device__ __shared__ DataSharingStateTy DataSharingState;
+
+////////////////////////////////////////////////////////////////////////////////
+// task ICV and (implicit & explicit) task state
+
+class omptarget_nvptx_TaskDescr {
+public:
+  // methods for flags
+  INLINE omp_sched_t GetRuntimeSched();
+  INLINE void SetRuntimeSched(omp_sched_t sched);
+  INLINE int IsDynamic() { return items.flags & TaskDescr_IsDynamic; }
+  INLINE void SetDynamic() {
+    items.flags = items.flags | TaskDescr_IsDynamic;
+  }
+  INLINE void ClearDynamic() {
+    items.flags = items.flags & (~TaskDescr_IsDynamic);
+  }
+  INLINE int InParallelRegion() { return items.flags & TaskDescr_InPar; }
+  INLINE int InL2OrHigherParallelRegion() {
+    return items.flags & TaskDescr_InParL2P;
+  }
+  INLINE int IsParallelConstruct() {
+    return items.flags & TaskDescr_IsParConstr;
+  }
+  INLINE int IsTaskConstruct() { return !IsParallelConstruct(); }
+  // methods for other fields
+  INLINE uint16_t &NThreads() { return items.nthreads; }
+  INLINE uint16_t &ThreadLimit() { return items.threadlimit; }
+  INLINE uint16_t &ThreadId() { return items.threadId; }
+  INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; }
+  INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
+  INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() { return prev; }
+  INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
+    prev = taskDescr;
+  }
+  // init & copy
+  INLINE void InitLevelZeroTaskDescr();
+  INLINE void InitLevelOneTaskDescr(uint16_t tnum,
+                                    omptarget_nvptx_TaskDescr *parentTaskDescr);
+  INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
+  INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
+  INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
+  INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
+  INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr,
+                              uint16_t tnum);
+  INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
+  INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
+                                   uint16_t tid, uint16_t tnum);
+  INLINE void SaveLoopData();
+  INLINE void RestoreLoopData() const;
+
+private:
+  // bits for flags: (7 used, 1 free)
+  // 3 bits (SchedMask) for runtime schedule
+  // 1 bit (IsDynamic) for dynamic schedule (false = static)
+  // 1 bit (InPar) if this thread has encountered one or more parallel regions
+  // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task)
+  // 1 bit (InParL2+) if this thread has encountered an L2 or higher parallel
+  // region
+  static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
+  static const uint8_t TaskDescr_IsDynamic = 0x8;
+  static const uint8_t TaskDescr_InPar = 0x10;
+  static const uint8_t TaskDescr_IsParConstr = 0x20;
+  static const uint8_t TaskDescr_InParL2P = 0x40;
+
+  struct SavedLoopDescr_items {
+    int64_t loopUpperBound;
+    int64_t nextLowerBound;
+    int64_t chunk;
+    int64_t stride;
+    kmp_sched_t schedule;
+  } loopData;
+
+  struct TaskDescr_items {
+    uint8_t flags; // 7 bits used (see flags above)
+    uint8_t unused;
+    uint16_t nthreads;         // thread num for subsequent parallel regions
+    uint16_t threadlimit;      // thread limit ICV
+    uint16_t threadId;         // thread id
+    uint16_t threadsInTeam;    // threads in current team
+    uint64_t runtimeChunkSize; // runtime chunk size
+  } items;
+  omptarget_nvptx_TaskDescr *prev;
+};
+
+// built on kmp
+typedef struct omptarget_nvptx_ExplicitTaskDescr {
+  omptarget_nvptx_TaskDescr
+      taskDescr;              // omptarget_nvptx task description (must be first)
+  kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
+} omptarget_nvptx_ExplicitTaskDescr;
+
+////////////////////////////////////////////////////////////////////////////////
+// Descriptor of a parallel region (worksharing in general)
+
+class omptarget_nvptx_WorkDescr {
+
+public:
+  // access to data
+  INLINE omptarget_nvptx_CounterGroup &CounterGroup() { return cg; }
+  INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }
+  // init
+  INLINE void InitWorkDescr();
+
+private:
+  omptarget_nvptx_CounterGroup cg; // for barrier (no other needed)
+  omptarget_nvptx_TaskDescr masterTaskICV;
+  bool hasCancel;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+class omptarget_nvptx_TeamDescr {
+public:
+  // access to data
+  INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
+    return &levelZeroTaskDescr;
+  }
+  INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
+    return workDescrForActiveParallel;
+  }
+  INLINE omp_lock_t *CriticalLock() { return &criticalLock; }
+  INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }
+
+  // init
+  INLINE void InitTeamDescr();
+
+  INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
+    // If this is invoked by the master thread of the master warp, then
+    // initialize it with a smaller slot.
+    if (IsMasterThread) {
+      // Do not initialize this slot again if it has already been initialized.
+      if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size)
+        return 0;
+      // Initialize the pointer to the end of the slot given the size of the
+      // data section. DataEnd is non-inclusive.
+      master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
+      // We currently do not have a next slot.
+      master_rootS[0].Next = 0;
+      master_rootS[0].Prev = 0;
+      master_rootS[0].PrevSlotStackPtr = 0;
+      return (__kmpc_data_sharing_slot *)&master_rootS[0];
+    }
+    // Do not initialize this slot again if it has already been initialized.
+    if (worker_rootS[wid].DataEnd ==
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size)
+      return 0;
+    // Initialize the pointer to the end of the slot given the size of the data
+    // section. DataEnd is non-inclusive.
+    worker_rootS[wid].DataEnd =
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
+    // We currently do not have a next slot.
+    worker_rootS[wid].Next = 0;
+    worker_rootS[wid].Prev = 0;
+    worker_rootS[wid].PrevSlotStackPtr = 0;
+    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
+  }
+
+  INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
+    worker_rootS[wid].DataEnd =
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
+    // We currently do not have a next slot.
+    worker_rootS[wid].Next = 0;
+    worker_rootS[wid].Prev = 0;
+    worker_rootS[wid].PrevSlotStackPtr = 0;
+    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
+  }
+
+private:
+  omptarget_nvptx_TaskDescr
+      levelZeroTaskDescr; // ICV for team master initial thread
+  omptarget_nvptx_WorkDescr
+      workDescrForActiveParallel; // one, ONLY for the active parallel region
+  omp_lock_t criticalLock;
+  uint64_t lastprivateIterBuffer;
+
+  __align__(16)
+      __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
+  __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// thread private data (struct of arrays for better coalescing)
+// tid refers here to the global thread id
+// multiple concurrent kernels are not supported at this time
+class omptarget_nvptx_ThreadPrivateContext {
+public:
+  // task
+  INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
+    return &levelOneTaskDescr[tid];
+  }
+  INLINE void SetTopLevelTaskDescr(int tid,
+                                   omptarget_nvptx_TaskDescr *taskICV) {
+    topTaskDescr[tid] = taskICV;
+  }
+  INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid);
+  // parallel
+  INLINE uint16_t &NumThreadsForNextParallel(int tid) {
+    return nextRegion.tnum[tid];
+  }
+  // simd
+  INLINE uint16_t &SimdLimitForNextSimd(int tid) {
+    return nextRegion.slim[tid];
+  }
+  // sync
+  INLINE Counter &Priv(int tid) { return priv[tid]; }
+  INLINE void IncrementPriv(int tid, Counter val) { priv[tid] += val; }
+  // schedule (for dispatch)
+  INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
+  INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
+  INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
+  INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
+  INLINE int64_t &Stride(int tid) { return stride[tid]; }
+
+  INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
+
+  INLINE void InitThreadPrivateContext(int tid);
+  INLINE void SetSourceQueue(uint64_t Src) { SourceQueue = Src; }
+  INLINE uint64_t GetSourceQueue() { return SourceQueue; }
+
+private:
+  // team context for this team
+  omptarget_nvptx_TeamDescr teamContext;
+  // task ICV for implicit threads in the only parallel region
+  omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
+  // pointer where to find the current task ICV (top of the stack)
+  omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
+  union {
+    // Only one of the two is live at the same time.
+    // parallel
+    uint16_t tnum[MAX_THREADS_PER_TEAM];
+    // simd limit
+    uint16_t slim[MAX_THREADS_PER_TEAM];
+  } nextRegion;
+  // sync
+  Counter priv[MAX_THREADS_PER_TEAM];
+  // schedule (for dispatch)
+  kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
+  int64_t chunk[MAX_THREADS_PER_TEAM];
+  int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
+  // state for dispatch with dyn/guided OR static (never use both at a time)
+  int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
+  int64_t stride[MAX_THREADS_PER_TEAM];
+  // Queue to which this object must be returned.
+  uint64_t SourceQueue;
+};
+
+/// Device environment data
+struct omptarget_device_environmentTy {
+  int32_t debug_level;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// global device environment
+////////////////////////////////////////////////////////////////////////////////
+
+extern __device__ omptarget_device_environmentTy omptarget_device_environment;
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// global data tables
+////////////////////////////////////////////////////////////////////////////////
+
+extern __device__ __shared__
+    omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
+extern __device__ __shared__ uint32_t execution_param;
+extern __device__ __shared__ void *ReductionScratchpadPtr;
+
+////////////////////////////////////////////////////////////////////////////////
+// work function (outlined parallel/simd functions) and arguments.
+// needed for L1 parallelism only.
+////////////////////////////////////////////////////////////////////////////////
+
+typedef void *omptarget_nvptx_WorkFn;
+extern volatile __device__ __shared__ omptarget_nvptx_WorkFn
+    omptarget_nvptx_workFn;
+
+////////////////////////////////////////////////////////////////////////////////
+// get private data structures
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
+INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
+INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor();
+INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
+
+////////////////////////////////////////////////////////////////////////////////
+// inlined implementation
+////////////////////////////////////////////////////////////////////////////////
+
+#include "counter_groupi.h"
+#include "omptarget-nvptxi.h"
+#include "supporti.h"
+
+#endif
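The __SHFL_SYNC / __SHFL_DOWN_SYNC / __BALLOT_SYNC / __ACTIVEMASK macros near the top of the header exist so the same warp-level code builds against both pre-9.0 and 9.0+ CUDA toolkits: with CUDART_VERSION >= 9000 they forward to the *_sync intrinsics that take an explicit lane mask, otherwise they fall back to the legacy intrinsics and derive the mask with __ballot(1). A minimal sketch of how such wrappers are typically used in a warp-wide sum follows; warpSum is an illustrative helper, not part of this header, and the two macro definitions are copied from it so the snippet is self-contained.

// Sketch only: a warp-wide sum written against the version-gated wrappers.
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
#define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
  __shfl_down_sync((mask), (var), (delta), (width))
#define __ACTIVEMASK() __activemask()
#else
#define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
  __shfl_down((var), (delta), (width))
#define __ACTIVEMASK() __ballot(1)
#endif

__device__ int warpSum(int val) {
  unsigned mask = __ACTIVEMASK(); // converged lanes in the current warp
  for (int offset = 16; offset > 0; offset /= 2)
    val += __SHFL_DOWN_SYNC(mask, val, offset, 32); // pull value from lane + offset
  return val; // lane 0 ends up holding the sum over the warp
}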
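The data-sharing slot sizes follow directly from the DATA_SHARING_SIZES enum, which is why the comments on the two *_slot_static structs refer to DS_Slot_Size and WARPSIZE * DS_Slot_Size. The standalone snippet below restates the relevant constants under renamed identifiers (WARPSIZE is assumed to be 32, as elsewhere in this runtime) and checks the arithmetic at compile time; it is an illustrative check, not part of the header.

// Standalone compile-time check of the slot sizes (assumes WARPSIZE == 32).
#define WARPSIZE_CHECK 32
enum DATA_SHARING_SIZES_CHECK {
  DS_Slot_Size_Check = 256,                                        // master slot data area
  DS_Worker_Warp_Slot_Size_Check = WARPSIZE_CHECK * DS_Slot_Size_Check, // worker slot data area
};
static_assert(DS_Worker_Warp_Slot_Size_Check == 8192,
              "one worker warp's Data area is 32 * 256 = 8 KiB");
// A team descriptor therefore statically reserves roughly
//   32 * 8192 B (worker slots) + 256 B (master slot) ~= 256 KiB
// of data-sharing storage, before the per-slot header fields.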
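As its leading comment notes, omptarget_nvptx_ThreadPrivateContext keeps per-thread state as a struct of arrays rather than an array of per-thread structs, so that neighbouring threads indexing with their tid touch neighbouring addresses and their accesses coalesce. A minimal sketch of the difference, assuming MAX_THREADS_PER_TEAM is 1024; SoAContext, AoSEntry, and touch are illustrative names, not runtime types:

// Sketch contrasting the two layouts (illustrative types, not runtime code).
#include <stdint.h>
#define MAX_THREADS_PER_TEAM 1024 // assumed value for this sketch

struct SoAContext {               // struct-of-arrays, as used by the runtime
  int64_t chunk[MAX_THREADS_PER_TEAM];
  int64_t stride[MAX_THREADS_PER_TEAM];
};
struct AoSEntry {                 // the array-of-structs alternative
  int64_t chunk;
  int64_t stride;
};

__device__ SoAContext soa;
__device__ AoSEntry aos[MAX_THREADS_PER_TEAM];

__global__ void touch() {
  int tid = threadIdx.x;
  soa.chunk[tid] += 1; // neighbouring tids hit addresses 8 bytes apart: coalesced
  aos[tid].chunk += 1; // addresses sizeof(AoSEntry) = 16 bytes apart: strided
}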