//===---- parallel.cu - NVPTX OpenMP parallel implementation ----- CUDA -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//
//    if (master) {
//      sequential code, decide which par loop to do, or if finished
//      __kmpc_kernel_prepare_parallel() // exec by master only
//    }
//    syncthreads // A
//    __kmpc_kernel_parallel() // exec by all
//    if (this thread is included in the parallel) {
//      switch () for all parallel loops
//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
//    }
//
//
//    The reason we don't exec end_parallel for the threads not included
//    in the parallel loop is that for each barrier in the parallel
//    region, these non-included threads will cycle through the
//    syncthread A. Thus they must preserve their current threadId that
//    is larger than thread in team.
//
// To make a long story short...
//
//===----------------------------------------------------------------------===//

#include "omptarget-nvptx.h"

typedef struct ConvergentSimdJob {
  omptarget_nvptx_TaskDescr taskDescr;
  omptarget_nvptx_TaskDescr *convHeadTaskDescr;
  uint16_t slimForNextSimd;
} ConvergentSimdJob;

////////////////////////////////////////////////////////////////////////////////
// support for convergent simd (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
                                          bool *IsFinal, int32_t *LaneSource,
                                          int32_t *LaneId, int32_t *NumLanes) {
  PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
  uint32_t ConvergentMask = Mask;
  int32_t ConvergentSize = __popc(ConvergentMask);
  uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
  *LaneSource += __ffs(WorkRemaining);
  *IsFinal = __popc(WorkRemaining) == 1;
  uint32_t lanemask_lt;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
  *LaneId = __popc(ConvergentMask & lanemask_lt);

  int threadId = GetLogicalThreadIdInBlock();
  int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

  ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
  int32_t SimdLimit =
      omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
  job->slimForNextSimd = SimdLimit;

  int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource);
  // reset simdlimit to avoid propagating to successive #simd
  if (SimdLimitSource > 0 && threadId == sourceThreadId)
    omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;

  // We cannot have more than the # of convergent threads.
  if (SimdLimitSource > 0)
    *NumLanes = min(ConvergentSize, SimdLimitSource);
  else
    *NumLanes = ConvergentSize;
  ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
         *NumLanes);

  // Set to true for lanes participating in the simd region.
  bool isActive = false;
  // Initialize state for active threads.
  if (*LaneId < *NumLanes) {
    omptarget_nvptx_TaskDescr *currTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
    omptarget_nvptx_TaskDescr *sourceTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
            sourceThreadId);
    job->convHeadTaskDescr = currTaskDescr;
    // install top descriptor from the thread for which the lanes are working.
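    // Active lanes adopt the source lane's top-level task descriptor so that
    // all participating lanes see the same ICVs; the lane's previous
    // descriptor was saved in job->convHeadTaskDescr above and is restored by
    // __kmpc_kernel_end_convergent_simd.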
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               sourceTaskDescr);
    isActive = true;
  }

  // requires a memory fence between threads of a warp
  return isActive;
}

EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_simd\n");
  // pop stack
  int threadId = GetLogicalThreadIdInBlock();
  ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
  omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
      job->slimForNextSimd;
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, job->convHeadTaskDescr);
}

typedef struct ConvergentParallelJob {
  omptarget_nvptx_TaskDescr taskDescr;
  omptarget_nvptx_TaskDescr *convHeadTaskDescr;
  uint16_t tnumForNextPar;
} ConvergentParallelJob;

////////////////////////////////////////////////////////////////////////////////
// support for convergent parallelism (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
                                              bool *IsFinal,
                                              int32_t *LaneSource) {
  PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
  uint32_t ConvergentMask = Mask;
  int32_t ConvergentSize = __popc(ConvergentMask);
  uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
  *LaneSource += __ffs(WorkRemaining);
  *IsFinal = __popc(WorkRemaining) == 1;
  uint32_t lanemask_lt;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
  uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);

  int threadId = GetLogicalThreadIdInBlock();
  int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

  ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
  int32_t NumThreadsClause =
      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
  job->tnumForNextPar = NumThreadsClause;

  int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
  // reset numthreads to avoid propagating to successive #parallel
  if (NumThreadsSource > 0 && threadId == sourceThreadId)
    omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
        0;

  // We cannot have more than the # of convergent threads.
  uint16_t NumThreads;
  if (NumThreadsSource > 0)
    NumThreads = min(ConvergentSize, NumThreadsSource);
  else
    NumThreads = ConvergentSize;
  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
         NumThreads);

  // Set to true for workers participating in the parallel region.
  bool isActive = false;
  // Initialize state for active threads.
  if (OmpId < NumThreads) {
    // init L2 task descriptor and storage for the L1 parallel task descriptor.
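    // Unlike the convergent simd case above, an active lane does not borrow
    // the source lane's descriptor directly; it fills its own descriptor in
    // the job buffer via CopyConvergentParent so that it carries its OpenMP
    // thread id (OmpId) and team size (NumThreads).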
    omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr;
    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");

    omptarget_nvptx_TaskDescr *currTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
    omptarget_nvptx_TaskDescr *sourceTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
            sourceThreadId);
    job->convHeadTaskDescr = currTaskDescr;
    newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads);
    // install new top descriptor
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               newTaskDescr);
    isActive = true;
  }

  // requires a memory fence between threads of a warp
  return isActive;
}

EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
  // pop stack
  int threadId = GetLogicalThreadIdInBlock();
  ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, job->convHeadTaskDescr);
  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
      job->tnumForNextPar;
}

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////

// Return the number of CUDA threads that participate in the parallel region.
// The calculation has to consider the simd implementation in nvptx,
// i.e. (num omp threads * num lanes).
//
// cudathreads =
//    if(num_threads != 0) {
//      if(thread_limit > 0) {
//        min (num_threads*numLanes ; thread_limit*numLanes);
//      } else {
//        min (num_threads*numLanes; blockDim.x)
//      }
//    } else {
//      if (thread_limit != 0) {
//        min (thread_limit*numLanes; blockDim.x)
//      } else {
//        // no thread_limit, no num_threads, use all cuda threads
//        blockDim.x;
//      }
//    }
//
// This routine is always called by the team master.
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
                                           int16_t IsOMPRuntimeInitialized) {
  PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
  omptarget_nvptx_workFn = WorkFn;

  if (!IsOMPRuntimeInitialized)
    return;

  // This routine is only called by the team master. The team master is
  // the first thread of the last warp. It always has the logical thread
  // id of 0 (since it is a shadow for the first worker thread).
  int threadId = 0;
  omptarget_nvptx_TaskDescr *currTaskDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
  ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
  ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
          "cannot be called in a parallel region.");
  if (currTaskDescr->InParallelRegion()) {
    PRINT0(LD_PAR, "already in parallel: go seq\n");
    return;
  }

  uint16_t CudaThreadsForParallel = 0;
  uint16_t NumThreadsClause =
      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);

  // we cannot have more than the block size
  uint16_t CudaThreadsAvail = GetNumberOfWorkersInTeam();

  // currTaskDescr->ThreadLimit(): If non-zero, this is the limit as
  // specified by the thread_limit clause on the target directive.
  // GetNumberOfWorkersInTeam(): This is the number of workers available
  // in this kernel instance.
  //
  // E.g.: If thread_limit is 33, the kernel is launched with 33+32=65
  // threads. The last warp is the master warp so in this case
  // GetNumberOfWorkersInTeam() returns 64.
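  // Worked example (with NumLanes == 1 below): for thread_limit(33) the
  // kernel has 64 workers as described above. A num_threads(40) clause then
  // gives CudaThreadsForParallel = min(40, 33) = 33, which the sm_70+ code
  // below further rounds down to 32 so that whole warps participate.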
  // this is different from ThreadAvail of OpenMP because we may be
  // using some of the CUDA threads as SIMD lanes
  int NumLanes = 1;
  if (NumThreadsClause != 0) {
    // reset request to avoid propagating to successive #parallel
    omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
        0;

    // assume that thread_limit*numlanes is already <= CudaThreadsAvail
    // because that is already checked on the host side (CUDA offloading rtl)
    if (currTaskDescr->ThreadLimit() != 0)
      CudaThreadsForParallel =
          NumThreadsClause * NumLanes < currTaskDescr->ThreadLimit() * NumLanes
              ? NumThreadsClause * NumLanes
              : currTaskDescr->ThreadLimit() * NumLanes;
    else {
      CudaThreadsForParallel = (NumThreadsClause * NumLanes > CudaThreadsAvail)
                                   ? CudaThreadsAvail
                                   : NumThreadsClause * NumLanes;
    }
  } else {
    if (currTaskDescr->ThreadLimit() != 0) {
      CudaThreadsForParallel =
          (currTaskDescr->ThreadLimit() * NumLanes > CudaThreadsAvail)
              ? CudaThreadsAvail
              : currTaskDescr->ThreadLimit() * NumLanes;
    } else
      CudaThreadsForParallel = CudaThreadsAvail;
  }

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  // On Volta and newer architectures we require that all lanes in
  // a warp participate in the parallel region. Round down to a
  // multiple of WARPSIZE since it is legal to do so in OpenMP.
  // CudaThreadsAvail is the number of workers available in this
  // kernel instance and is greater than or equal to
  // currTaskDescr->ThreadLimit().
  if (CudaThreadsForParallel < CudaThreadsAvail) {
    CudaThreadsForParallel =
        (CudaThreadsForParallel < WARPSIZE)
            ? 1
            : CudaThreadsForParallel & ~((uint16_t)WARPSIZE - 1);
  }
#endif

  ASSERT(LT_FUSSY, CudaThreadsForParallel > 0,
         "bad thread request of %d threads", CudaThreadsForParallel);
  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
          "only team master can create parallel");

  // set number of threads on work descriptor
  // this is different from the number of cuda threads required for the
  // parallel region
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr,
                                             CudaThreadsForParallel / NumLanes);
  // init counters (copy start to init)
  workDescr.CounterGroup().Reset();
}

// All workers call this function. Deactivate those not needed.
// Fn - the outlined work function to execute.
// returns True if this thread is active, else False.
//
// Only the worker threads call this routine.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
                                   int16_t IsOMPRuntimeInitialized) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");

  // Work function and arguments for L1 parallel region.
  *WorkFn = omptarget_nvptx_workFn;

  if (!IsOMPRuntimeInitialized)
    return true;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Only the worker threads call this routine and the master warp
  // never arrives here. Therefore, use the nvptx thread id.
  int threadId = GetThreadIdInBlock();
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  // Set to true for workers participating in the parallel region.
  bool isActive = false;
  // Initialize state for active threads.
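  // Threads with an id >= ThreadsInTeam() take no part in this parallel
  // region: they return false and go back to waiting at the synchronization
  // point described in the file header (syncthreads A).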
  if (threadId < workDescr.WorkTaskDescr()->ThreadsInTeam()) {
    // init work descriptor from workdescr
    omptarget_nvptx_TaskDescr *newTaskDescr =
        omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
    newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
    // install new top descriptor
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               newTaskDescr);
    // init private from int value
    workDescr.CounterGroup().Init(
        omptarget_nvptx_threadPrivateContext->Priv(threadId));
    PRINT(LD_PAR,
          "thread will execute parallel region with id %d in a team of "
          "%d threads\n",
          newTaskDescr->ThreadId(), newTaskDescr->NThreads());

    isActive = true;
  }

  return isActive;
}

EXTERN void __kmpc_kernel_end_parallel() {
  // pop stack
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
  // Only the worker threads call this routine and the master warp
  // never arrives here. Therefore, use the nvptx thread id.
  int threadId = GetThreadIdInBlock();
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());
}

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes sequential
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");

  // assume this is only called for nested parallel
  int threadId = GetLogicalThreadIdInBlock();

  // unlike actual parallel, threads in the same team do not share
  // the workTaskDescr in this case and num threads is fixed to 1

  // get current task
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->SaveLoopData();

  // allocate new task descriptor and copy value from current one, set prev to
  // it
  omptarget_nvptx_TaskDescr *newTaskDescr =
      (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
                                              (char *)"new seq parallel task");
  newTaskDescr->CopyParent(currTaskDescr);

  // tweak values for serialized parallel case:
  // - each thread becomes ID 0 in its serialized parallel, and
  // - there is only one thread per team
  newTaskDescr->ThreadId() = 0;
  newTaskDescr->ThreadsInTeam() = 1;

  // set new task descriptor as top
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                             newTaskDescr);
}

EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc,
                                           uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");

  // pop stack
  int threadId = GetLogicalThreadIdInBlock();
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  // set new top
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());
  // free
  SafeFree(currTaskDescr, (char *)"new seq parallel task");
  currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->RestoreLoopData();
}

EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_parallel_level\n");

  int threadId = GetLogicalThreadIdInBlock();
  omptarget_nvptx_TaskDescr *currTaskDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
  if (currTaskDescr->InL2OrHigherParallelRegion())
    return 2;
  else if (currTaskDescr->InParallelRegion())
    return 1;
  else
    return 0;
}

// This kmpc call returns the thread id across all teams.
// Its value is cached by the compiler and used when calling the runtime.
// On nvptx it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc) {
  return GetLogicalThreadIdInBlock();
}

////////////////////////////////////////////////////////////////////////////////
// push params
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t tid,
                                    int32_t num_threads) {
  PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
  tid = GetLogicalThreadIdInBlock();
  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
      num_threads;
}

EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t tid,
                                   int32_t simd_limit) {
  PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", simd_limit);
  tid = GetLogicalThreadIdInBlock();
  omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
}

// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.
EXTERN void __kmpc_push_num_teams(kmp_Indent *loc, int32_t tid,
                                  int32_t num_teams, int32_t thread_limit) {
  PRINT(LD_IO, "call kmpc_push_num_teams %d\n", num_teams);
  ASSERT0(LT_FUSSY, FALSE,
          "should never have anything with new teams on device");
}

EXTERN void __kmpc_push_proc_bind(kmp_Indent *loc, uint32_t tid,
                                  int proc_bind) {
  PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", proc_bind);
}
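
// Illustrative sketch only (not part of the runtime): a compiler-generated
// worker loop is expected to drive the entry points above roughly as in the
// pattern from the file header. The names workerLoop and WorkFn below are
// hypothetical.
//
//   void workerLoop() {
//     for (;;) {
//       __syncthreads();              // "syncthreads A" in the file header
//       void *WorkFn;
//       bool active = __kmpc_kernel_parallel(&WorkFn,
//                                            /*IsOMPRuntimeInitialized=*/1);
//       if (!WorkFn)
//         return;                     // termination signal from the master
//       if (active) {
//         ((void (*)())WorkFn)();     // outlined parallel region
//         __kmpc_kernel_end_parallel();
//       }
//     }
//   }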