aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Jon Chesterfield <jonathanchesterfield@gmail.com> 2019-08-28 15:04:06 +0000
committer: Jon Chesterfield <jonathanchesterfield@gmail.com> 2019-08-28 15:04:06 +0000
commit: 1fe4d6217edb8c431631aa39a1f75fd2519bb7eb (patch)
tree: f31720090e4a2f28ae2a846c63221f38ccaa456a
parent: ee031606128fec949b4f90152e25c37fdab13a8c (diff)
Use target_impl functions to replace more inline asm
Summary:
Use target_impl functions to replace more inline asm.
Follow on from D65836. Removes remaining asm shuffles and lanemask accessors.
Also changes the types of target_impl bitwise functions to unsigned.

Reviewers: jdoerfert, ABataev, grokos, Hahnfeld, gregrodgers, ronlieb, hfinkel

Subscribers: openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D66809

git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@370216 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- libomptarget/deviceRTLs/nvptx/src/loop.cu 6
-rw-r--r-- libomptarget/deviceRTLs/nvptx/src/parallel.cu 22
-rw-r--r-- libomptarget/deviceRTLs/nvptx/src/reduction.cu 23
-rw-r--r-- libomptarget/deviceRTLs/nvptx/src/supporti.h 10
-rw-r--r-- libomptarget/deviceRTLs/nvptx/src/target_impl.h 16
5 files changed, 38 insertions, 39 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu
index 4f24ada..3ed44f7 100644
--- a/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -381,7 +381,7 @@ public:
// Support for dispatch next
INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
- int lo, hi;
+ uint32_t lo, hi;
__kmpc_impl_unpack(val, lo, hi);
hi = __kmpc_impl_shfl_sync(active, hi, leader);
lo = __kmpc_impl_shfl_sync(active, lo, leader);
@@ -390,8 +390,8 @@ public:
INLINE static uint64_t NextIter() {
__kmpc_impl_lanemask_t active = __ACTIVEMASK();
- int leader = __kmpc_impl_ffs(active) - 1;
- int change = __kmpc_impl_popc(active);
+ uint32_t leader = __kmpc_impl_ffs(active) - 1;
+ uint32_t change = __kmpc_impl_popc(active);
__kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();
unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt);
uint64_t warp_res;
diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index 182a4f6..ee47cc4 100644
--- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -49,13 +49,12 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
int32_t *LaneId, int32_t *NumLanes) {
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
uint32_t ConvergentMask = Mask;
- int32_t ConvergentSize = __popc(ConvergentMask);
+ int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
- *LaneSource += __ffs(WorkRemaining);
- *IsFinal = __popc(WorkRemaining) == 1;
- uint32_t lanemask_lt;
- asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
- *LaneId = __popc(ConvergentMask & lanemask_lt);
+ *LaneSource += __kmpc_impl_ffs(WorkRemaining);
+ *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
+ uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
+ *LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
@@ -123,13 +122,12 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
int32_t *LaneSource) {
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
uint32_t ConvergentMask = Mask;
- int32_t ConvergentSize = __popc(ConvergentMask);
+ int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
- *LaneSource += __ffs(WorkRemaining);
- *IsFinal = __popc(WorkRemaining) == 1;
- uint32_t lanemask_lt;
- asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
- uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);
+ *LaneSource += __kmpc_impl_ffs(WorkRemaining);
+ *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
+ uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
+ uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
diff --git a/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/libomptarget/deviceRTLs/nvptx/src/reduction.cu
index e5e76d5..ea53f61 100644
--- a/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -28,12 +28,11 @@ EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
}
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
- int lo, hi;
- asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
+ uint32_t lo, hi;
+ __kmpc_impl_unpack(val, lo, hi);
hi = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, hi, delta, size);
lo = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, lo, delta, size);
- asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
- return val;
+ return __kmpc_impl_pack(lo, hi);
}
INLINE static void gpu_regular_warp_reduce(void *reduce_data,
@@ -60,18 +59,16 @@ INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
INLINE static uint32_t
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
- uint32_t lanemask_lt;
- uint32_t lanemask_gt;
uint32_t size, remote_id, physical_lane_id;
physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
- asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
+ uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
uint32_t Liveness = __ACTIVEMASK();
- uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2;
- asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt));
+ uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
+ uint32_t lanemask_gt = __kmpc_impl_lanemask_gt();
do {
Liveness = __ACTIVEMASK();
- remote_id = __ffs(Liveness & lanemask_gt);
- size = __popc(Liveness);
+ remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
+ size = __kmpc_impl_popc(Liveness);
logical_lane_id /= 2;
shflFct(reduce_data, /*LaneId =*/logical_lane_id,
/*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
@@ -150,7 +147,7 @@ static int32_t nvptx_parallel_reduce_nowait(
gpu_regular_warp_reduce(reduce_data, shflFct);
else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
gpu_irregular_warp_reduce(reduce_data, shflFct,
- /*LaneCount=*/__popc(Liveness),
+ /*LaneCount=*/__kmpc_impl_popc(Liveness),
/*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
// parallel region may enter here; return
@@ -325,7 +322,7 @@ static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
gpu_regular_warp_reduce(reduce_data, shflFct);
else // Partial warp but contiguous lanes
gpu_irregular_warp_reduce(reduce_data, shflFct,
- /*LaneCount=*/__popc(Liveness),
+ /*LaneCount=*/__kmpc_impl_popc(Liveness),
/*LaneId=*/ThreadId % WARPSIZE);
// When we have more than [warpsize] number of threads
diff --git a/libomptarget/deviceRTLs/nvptx/src/supporti.h b/libomptarget/deviceRTLs/nvptx/src/supporti.h
index c1a8467..884982d 100644
--- a/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -206,9 +206,8 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
INLINE void IncParallelLevel(bool ActiveParallel) {
unsigned Active = __ACTIVEMASK();
__kmpc_impl_syncwarp(Active);
- unsigned LaneMaskLt;
- asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
- unsigned Rank = __popc(Active & LaneMaskLt);
+ unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();
+ unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
if (Rank == 0) {
parallelLevel[GetWarpId()] +=
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
@@ -220,9 +219,8 @@ INLINE void IncParallelLevel(bool ActiveParallel) {
INLINE void DecParallelLevel(bool ActiveParallel) {
unsigned Active = __ACTIVEMASK();
__kmpc_impl_syncwarp(Active);
- unsigned LaneMaskLt;
- asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
- unsigned Rank = __popc(Active & LaneMaskLt);
+ unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();
+ unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
if (Rank == 0) {
parallelLevel[GetWarpId()] -=
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
diff --git a/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/libomptarget/deviceRTLs/nvptx/src/target_impl.h
index b9f930d..8986195 100644
--- a/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -16,12 +16,12 @@
#include "option.h"
-INLINE void __kmpc_impl_unpack(int64_t val, int32_t &lo, int32_t &hi) {
+INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
}
-INLINE int64_t __kmpc_impl_pack(int32_t lo, int32_t hi) {
- int64_t val;
+INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
+ uint64_t val;
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
return val;
}
@@ -34,9 +34,15 @@ INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
return res;
}
-INLINE int __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
+INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
+ __kmpc_impl_lanemask_t res;
+ asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
+ return res;
+}
+
+INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
-INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); }
+INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); }
#ifndef CUDA_VERSION
#error CUDA_VERSION macro is undefined, something wrong with cuda.