Diffstat (limited to 'final/libomptarget/deviceRTLs/nvptx/src/reduction.cu')
-rw-r--r-- | final/libomptarget/deviceRTLs/nvptx/src/reduction.cu | 429 |
1 files changed, 429 insertions, 0 deletions
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/final/libomptarget/deviceRTLs/nvptx/src/reduction.cu
new file mode 100644
index 0000000..b813a11
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -0,0 +1,429 @@
+//===---- reduction.cu - NVPTX OpenMP reduction implementation ---- CUDA
+//-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of reduction with KMPC interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include <complex.h>
+#include <stdio.h>
+
+#include "omptarget-nvptx.h"
+
+// may eventually remove this
+EXTERN
+int32_t __gpu_block_reduce() {
+  int tid = GetLogicalThreadIdInBlock();
+  int nt = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
+  if (nt != blockDim.x)
+    return 0;
+  unsigned tnum = __ACTIVEMASK();
+  if (tnum != (~0x0)) // assume warpSize is 32
+    return 0;
+  return 1;
+}
+
+EXTERN
+int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars,
+                          size_t reduce_size, void *reduce_data,
+                          void *reduce_array_size, kmp_ReductFctPtr *reductFct,
+                          kmp_CriticalName *lck) {
+  int threadId = GetLogicalThreadIdInBlock();
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
+  int numthread;
+  if (currTaskDescr->IsParallelConstruct()) {
+    numthread =
+        GetNumberOfOmpThreads(threadId, isSPMDMode(), isRuntimeUninitialized());
+  } else {
+    numthread = GetNumberOfOmpTeams();
+  }
+
+  if (numthread == 1)
+    return 1;
+  if (!__gpu_block_reduce())
+    return 2;
+  if (threadIdx.x == 0)
+    return 1;
+  return 0;
+}
+
+EXTERN
+int32_t __kmpc_reduce_combined(kmp_Indent *loc) {
+  return threadIdx.x == 0 ? 2 : 0;
+}
+
+EXTERN
+int32_t __kmpc_reduce_simd(kmp_Indent *loc) {
+  return (threadIdx.x % 32 == 0) ? 1 : 0;
+}
+
+EXTERN
+void __kmpc_nvptx_end_reduce(int32_t global_tid) {}
+
+EXTERN
+void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}
+
+EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
+  return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size);
+}
+
+EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
+  int lo, hi;
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
+  hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size);
+  lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size);
+  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
+  return val;
+}
+
+static INLINE void gpu_regular_warp_reduce(void *reduce_data,
+                                           kmp_ShuffleReductFctPtr shflFct) {
+  for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
+    shflFct(reduce_data, /*LaneId - not used= */ 0,
+            /*Offset = */ mask, /*AlgoVersion=*/0);
+  }
+}
+
+static INLINE void gpu_irregular_warp_reduce(void *reduce_data,
+                                             kmp_ShuffleReductFctPtr shflFct,
+                                             uint32_t size, uint32_t tid) {
+  uint32_t curr_size;
+  uint32_t mask;
+  curr_size = size;
+  mask = curr_size / 2;
+  while (mask > 0) {
+    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
+    curr_size = (curr_size + 1) / 2;
+    mask = curr_size / 2;
+  }
+}
+
+static INLINE uint32_t
+gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
+  uint32_t lanemask_lt;
+  uint32_t lanemask_gt;
+  uint32_t size, remote_id, physical_lane_id;
+  physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
+  uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+  uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2;
+  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt));
+  do {
+    Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+    remote_id = __ffs(Liveness & lanemask_gt);
+    size = __popc(Liveness);
+    logical_lane_id /= 2;
+    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
+            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
+  } while (logical_lane_id % 2 == 0 && size > 1);
+  return (logical_lane_id == 0);
+}
+
+EXTERN
+int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
+                                        size_t reduce_size, void *reduce_data,
+                                        kmp_ShuffleReductFctPtr shflFct,
+                                        kmp_InterWarpCopyFctPtr cpyFct) {
+  uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+  if (Liveness == 0xffffffff) {
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+    return GetThreadIdInBlock() % WARPSIZE ==
+           0; // Result on lane 0 of the simd warp.
+  } else {
+    return gpu_irregular_simd_reduce(
+        reduce_data, shflFct); // Result on the first active lane.
+  }
+}
+
+INLINE
+int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
+                                     size_t reduce_size, void *reduce_data,
+                                     kmp_ShuffleReductFctPtr shflFct,
+                                     kmp_InterWarpCopyFctPtr cpyFct,
+                                     bool isSPMDExecutionMode,
+                                     bool isRuntimeUninitialized = false) {
+  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
+  uint32_t NumThreads = GetNumberOfOmpThreads(
+      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
+  if (NumThreads == 1)
+    return 1;
+  /*
+   * This reduce function handles reduction within a team. It handles
+   * parallel regions in both L1 and L2 parallelism levels. It also
+   * supports Generic, SPMD, and NoOMP modes.
+   *
+   * 1. Reduce within a warp.
+   * 2. Warp master copies value to warp 0 via shared memory.
+   * 3. Warp 0 reduces to a single value.
+   * 4. The reduced value is available in the thread that returns 1.
+   */
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
+  uint32_t WarpId = BlockThreadId / WARPSIZE;
+
+  // Volta execution model:
+  // For the Generic execution mode a parallel region either has 1 thread and
+  // beyond that, always a multiple of 32. For the SPMD execution mode we may
+  // have any number of threads.
+  if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/NumThreads % WARPSIZE,
+                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
+
+  // When we have more than [warpsize] number of threads
+  // a block reduction is performed here.
+  //
+  // Only L1 parallel region can enter this if condition.
+  if (NumThreads > WARPSIZE) {
+    // Gather all the reduced values from each warp
+    // to the first warp.
+    cpyFct(reduce_data, WarpsNeeded);
+
+    if (WarpId == 0)
+      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                BlockThreadId);
+
+    return BlockThreadId == 0;
+  }
+  return BlockThreadId == 0;
+#else
+  uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+  if (Liveness == 0xffffffff) // Full warp
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/__popc(Liveness),
+                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
+  else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
+                                    // parallel region may enter here; return
+                                    // early.
+    return gpu_irregular_simd_reduce(reduce_data, shflFct);
+
+  // When we have more than [warpsize] number of threads
+  // a block reduction is performed here.
+  //
+  // Only L1 parallel region can enter this if condition.
+  if (NumThreads > WARPSIZE) {
+    uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
+    // Gather all the reduced values from each warp
+    // to the first warp.
+    cpyFct(reduce_data, WarpsNeeded);
+
+    uint32_t WarpId = BlockThreadId / WARPSIZE;
+    if (WarpId == 0)
+      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                BlockThreadId);
+
+    return BlockThreadId == 0;
+  } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) {
+    return BlockThreadId == 0;
+  }
+
+  // Get the OMP thread Id. This is different from BlockThreadId in the case of
+  // an L2 parallel region.
+  return GetOmpThreadId(BlockThreadId, isSPMDExecutionMode,
+                        isRuntimeUninitialized) == 0;
+#endif // __CUDA_ARCH__ >= 700
+}
+
+EXTERN
+int32_t __kmpc_nvptx_parallel_reduce_nowait(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
+  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
+                                      reduce_data, shflFct, cpyFct,
+                                      /*isSPMDExecutionMode=*/isSPMDMode());
+}
+
+EXTERN
+int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
+  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
+                                      reduce_data, shflFct, cpyFct,
+                                      /*isSPMDExecutionMode=*/true,
+                                      /*isRuntimeUninitialized=*/true);
+}
+
+EXTERN
+int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
+  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
+                                      reduce_data, shflFct, cpyFct,
+                                      /*isSPMDExecutionMode=*/false,
+                                      /*isRuntimeUninitialized=*/true);
+}
+
+INLINE
+int32_t nvptx_teams_reduce_nowait(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct,
+    bool isSPMDExecutionMode, bool isRuntimeUninitialized = false) {
+  uint32_t ThreadId = GetLogicalThreadIdInBlock();
+  // In non-generic mode all workers participate in the teams reduction.
+  // In generic mode only the team master participates in the teams
+  // reduction because the workers are waiting for parallel work.
+  uint32_t NumThreads =
+      isSPMDExecutionMode
+          ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true,
+                                  isRuntimeUninitialized)
+          : /*Master thread only*/ 1;
+  uint32_t TeamId = GetBlockIdInKernel();
+  uint32_t NumTeams = GetNumberOfBlocksInKernel();
+  __shared__ volatile bool IsLastTeam;
+
+  // Team masters of all teams write to the scratchpad.
+  if (ThreadId == 0) {
+    unsigned int *timestamp = GetTeamsReductionTimestamp();
+    char *scratchpad = GetTeamsReductionScratchpad();
+
+    scratchFct(reduce_data, scratchpad, TeamId, NumTeams);
+    __threadfence();
+
+    // atomicInc increments 'timestamp' and has a range [0, NumTeams-1].
+    // It resets 'timestamp' back to 0 once the last team increments
+    // this counter.
+    unsigned val = atomicInc(timestamp, NumTeams - 1);
+    IsLastTeam = val == NumTeams - 1;
+  }
+
+  // We have to wait on L1 barrier because in GENERIC mode the workers
+  // are waiting on barrier 0 for work.
+  //
+  // If we guard this barrier as follows it leads to deadlock, probably
+  // because of a compiler bug: if (!IsGenericMode()) __syncthreads();
+  uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE;
+  named_sync(L1_BARRIER, SyncWarps * WARPSIZE);
+
+  // If this team is not the last, quit.
+  if (/* Volatile read by all threads */ !IsLastTeam)
+    return 0;
+
+  //
+  // Last team processing.
+  //
+
+  // Threads in excess of #teams do not participate in reduction of the
+  // scratchpad values.
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  uint32_t ActiveThreads = NumThreads;
+  if (NumTeams < NumThreads) {
+    ActiveThreads =
+        (NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1);
+  }
+  if (ThreadId >= ActiveThreads)
+    return 0;
+
+  // Load from scratchpad and reduce.
+  char *scratchpad = GetTeamsReductionScratchpad();
+  ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
+  for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads)
+    ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
+
+  uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
+  uint32_t WarpId = ThreadId / WARPSIZE;
+
+  // Reduce across warps to the warp master.
+  if ((ActiveThreads % WARPSIZE == 0) ||
+      (WarpId < WarpsNeeded - 1)) // Full warp
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else if (ActiveThreads > 1) // Partial warp but contiguous lanes
+    // Only SPMD execution mode comes thru this case.
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/ActiveThreads % WARPSIZE,
+                              /*LaneId=*/ThreadId % WARPSIZE);
+
+  // When we have more than [warpsize] number of threads
+  // a block reduction is performed here.
+  if (ActiveThreads > WARPSIZE) {
+    // Gather all the reduced values from each warp
+    // to the first warp.
+    cpyFct(reduce_data, WarpsNeeded);
+
+    if (WarpId == 0)
+      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
+  }
+#else
+  if (ThreadId >= NumTeams)
+    return 0;
+
+  // Load from scratchpad and reduce.
+  char *scratchpad = GetTeamsReductionScratchpad();
+  ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
+  for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
+    ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
+
+  // Reduce across warps to the warp master.
+  uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+  if (Liveness == 0xffffffff) // Full warp
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else // Partial warp but contiguous lanes
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/__popc(Liveness),
+                              /*LaneId=*/ThreadId % WARPSIZE);
+
+  // When we have more than [warpsize] number of threads
+  // a block reduction is performed here.
+  uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads;
+  if (ActiveThreads > WARPSIZE) {
+    uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
+    // Gather all the reduced values from each warp
+    // to the first warp.
+    cpyFct(reduce_data, WarpsNeeded);
+
+    uint32_t WarpId = ThreadId / WARPSIZE;
+    if (WarpId == 0)
+      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
+  }
+#endif // __CUDA_ARCH__ >= 700
+
+  return ThreadId == 0;
+}
+
+EXTERN
+int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
+                                         size_t reduce_size, void *reduce_data,
+                                         kmp_ShuffleReductFctPtr shflFct,
+                                         kmp_InterWarpCopyFctPtr cpyFct,
+                                         kmp_CopyToScratchpadFctPtr scratchFct,
+                                         kmp_LoadReduceFctPtr ldFct) {
+  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
+                                   reduce_data, shflFct, cpyFct, scratchFct,
+                                   ldFct, /*isSPMDExecutionMode=*/isSPMDMode());
+}
+
+EXTERN
+int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
+  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
+                                   reduce_data, shflFct, cpyFct, scratchFct,
+                                   ldFct,
+                                   /*isSPMDExecutionMode=*/true,
+                                   /*isRuntimeUninitialized=*/true);
+}
+
+EXTERN
+int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
+  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
+                                   reduce_data, shflFct, cpyFct, scratchFct,
+                                   ldFct,
+                                   /*isSPMDExecutionMode=*/false,
+                                   /*isRuntimeUninitialized=*/true);
+}
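
For orientation, the sketch below shows roughly how compiler-generated code might drive the parallel reduction entry point above for a single int sum, assuming SPMD mode with a thread count that is a multiple of 32. It is not part of the commit: the names my_reduce_data_t, my_shuffle_and_reduce, my_inter_warp_copy, and the shared transfer buffer are hypothetical, their shapes are only inferred from the shflFct/cpyFct call sites above, and the real callbacks are emitted by clang's OpenMP codegen with the exact typedefs coming from omptarget-nvptx.h.

// Illustrative sketch only; assumes the declarations and WARPSIZE macro from
// omptarget-nvptx.h and linking against this deviceRTL.
#include "omptarget-nvptx.h"

// Hypothetical reduce_data layout: one partial sum per thread.
struct my_reduce_data_t {
  int sum;
};

// Shaped after the shflFct call sites above: (data, lane_id, offset, algo).
// Pulls the partner lane's partial sum via __kmpc_shuffle_int32 and folds it
// in; the lane_id/algo bounds checks needed for partial warps are omitted,
// which is only valid for the full-warp (AlgoVersion 0) path.
static __device__ void my_shuffle_and_reduce(void *data, int16_t lane_id,
                                             int16_t lane_offset,
                                             int16_t algo_version) {
  my_reduce_data_t *d = (my_reduce_data_t *)data;
  int remote = __kmpc_shuffle_int32(d->sum, lane_offset, WARPSIZE);
  d->sum += remote;
}

// Shaped after the cpyFct call sites above: (data, num_warps). Each warp
// master publishes its warp's value to shared memory and the first warp reads
// the values back so it can finish the block-level reduction. The real helper
// uses the runtime's own transfer medium and barriers; __syncthreads() here is
// a simplification that is only safe in SPMD mode.
static __device__ void my_inter_warp_copy(void *data, int32_t num_warps) {
  __shared__ int transfer[WARPSIZE]; // at most 32 warps per block
  my_reduce_data_t *d = (my_reduce_data_t *)data;
  int tid = threadIdx.x;
  if (tid % WARPSIZE == 0)
    transfer[tid / WARPSIZE] = d->sum; // warp masters publish
  __syncthreads();
  if (tid < num_warps)
    d->sum = transfer[tid]; // first-warp lanes collect
}

// Per-thread usage: reduce my_partial across the team. Exactly one thread
// gets a return value of 1 and commits the combined result.
static __device__ void my_reduce_into(int my_partial, int *team_result) {
  my_reduce_data_t data = {my_partial};
  if (__kmpc_nvptx_parallel_reduce_nowait(/*global_tid=*/0, /*num_vars=*/1,
                                          sizeof(data), &data,
                                          my_shuffle_and_reduce,
                                          my_inter_warp_copy) == 1) {
    *team_result = data.sum;
    __kmpc_nvptx_end_reduce_nowait(/*global_tid=*/0); // no-op above, kept for symmetry
  }
}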