Diffstat (limited to 'final/libomptarget/plugins/cuda/src/rtl.cpp')
-rw-r--r--  final/libomptarget/plugins/cuda/src/rtl.cpp  763
1 file changed, 763 insertions, 0 deletions
diff --git a/final/libomptarget/plugins/cuda/src/rtl.cpp b/final/libomptarget/plugins/cuda/src/rtl.cpp
new file mode 100644
index 0000000..d265a87
--- /dev/null
+++ b/final/libomptarget/plugins/cuda/src/rtl.cpp
@@ -0,0 +1,763 @@
+//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for CUDA machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cinttypes> // for the PRId64/PRIu64 format macros used below
+#include <cstddef>
+#include <cuda.h>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "omptargetplugin.h"
+
+#ifndef TARGET_NAME
+#define TARGET_NAME CUDA
+#endif
+
+#ifdef OMPTARGET_DEBUG
+static int DebugLevel = 0;
+
+#define GETNAME2(name) #name
+#define GETNAME(name) GETNAME2(name)
+#define DP(...) \
+ do { \
+ if (DebugLevel > 0) { \
+ DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
+ } \
+ } while (false)
+#else // OMPTARGET_DEBUG
+#define DP(...) {}
+#endif // OMPTARGET_DEBUG
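+
+// DebugLevel is initialized from the LIBOMPTARGET_DEBUG environment variable
+// when the RTLDeviceInfoTy singleton below is constructed.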
+
+#include "../../common/elf_common.c"
+
+// Utility for retrieving and printing CUDA error string.
+#ifdef CUDA_ERROR_REPORT
+#define CUDA_ERR_STRING(err) \
+ do { \
+ const char *errStr; \
+ cuGetErrorString(err, &errStr); \
+ DP("CUDA error is: %s\n", errStr); \
+ } while (0)
+#else
+#define CUDA_ERR_STRING(err) \
+ {}
+#endif
+
+/// Keep entries table per device.
+struct FuncOrGblEntryTy {
+ __tgt_target_table Table;
+ std::vector<__tgt_offload_entry> Entries;
+};
+
+enum ExecutionModeType {
+ SPMD, // constructors, destructors,
+ // combined constructs (`teams distribute parallel for [simd]`)
+ GENERIC, // everything else
+ NONE
+};
+
+/// Use a single entity to encode a kernel and a set of flags
+struct KernelTy {
+ CUfunction Func;
+
+ // execution mode of kernel
+ // 0 - SPMD mode (without master warp)
+ // 1 - Generic mode (with master warp)
+ int8_t ExecutionMode;
+
+ KernelTy(CUfunction _Func, int8_t _ExecutionMode)
+ : Func(_Func), ExecutionMode(_ExecutionMode) {}
+};
+
+/// Device environment data
+/// Manually sync with the deviceRTL side for now; move to a dedicated header
+/// file later.
+struct omptarget_device_environmentTy {
+ int32_t debug_level;
+};
+
+/// List that contains all the kernels.
+/// FIXME: we may need this to be per device and per library.
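+/// A std::list is used (rather than a vector) so that pointers to elements
+/// remain stable as kernels are appended; __tgt_rtl_load_binary publishes
+/// &KernelsList.back() as the entry address for each kernel.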
+std::list<KernelTy> KernelsList;
+
+/// Class containing all the device information.
+class RTLDeviceInfoTy {
+ std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+
+public:
+ int NumberOfDevices;
+ std::vector<CUmodule> Modules;
+ std::vector<CUcontext> Contexts;
+
+ // Device properties
+ std::vector<int> ThreadsPerBlock;
+ std::vector<int> BlocksPerGrid;
+ std::vector<int> WarpSize;
+
+ // OpenMP properties
+ std::vector<int> NumTeams;
+ std::vector<int> NumThreads;
+
+ // OpenMP Environment properties
+ int EnvNumTeams;
+ int EnvTeamLimit;
+
+ //static int EnvNumThreads;
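+  // The hard limits below are conservative caps: 1024 matches the maximum
+  // threads per block on current CUDA devices, and the 64k team cap stays
+  // well below the grid-dimension limit.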
+ static const int HardTeamLimit = 1<<16; // 64k
+ static const int HardThreadLimit = 1024;
+ static const int DefaultNumTeams = 128;
+ static const int DefaultNumThreads = 128;
+
+ // Record entry point associated with device
+ void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+ E.Entries.push_back(entry);
+ }
+
+ // Return true if the entry is associated with device
+ bool findOffloadEntry(int32_t device_id, void *addr) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+ for (auto &it : E.Entries) {
+ if (it.addr == addr)
+ return true;
+ }
+
+ return false;
+ }
+
+ // Return the pointer to the target entries table
+ __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+ int32_t size = E.Entries.size();
+
+ // Table is empty
+ if (!size)
+ return 0;
+
+    __tgt_offload_entry *begin = &E.Entries[0];
+    __tgt_offload_entry *end = begin + size;
+
+    // Update table info according to the entries and return the pointer
+    E.Table.EntriesBegin = begin;
+    E.Table.EntriesEnd = end;
+
+ return &E.Table;
+ }
+
+ // Clear entries table for a device
+ void clearOffloadEntriesTable(int32_t device_id) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncGblEntries[device_id].emplace_back();
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+ E.Entries.clear();
+ E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
+ }
+
+ RTLDeviceInfoTy() {
+#ifdef OMPTARGET_DEBUG
+ if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
+ DebugLevel = std::stoi(envStr);
+ }
+#endif // OMPTARGET_DEBUG
+
+ DP("Start initializing CUDA\n");
+
+ CUresult err = cuInit(0);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when initializing CUDA\n");
+ CUDA_ERR_STRING(err);
+ return;
+ }
+
+ NumberOfDevices = 0;
+
+ err = cuDeviceGetCount(&NumberOfDevices);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when getting CUDA device count\n");
+ CUDA_ERR_STRING(err);
+ return;
+ }
+
+ if (NumberOfDevices == 0) {
+ DP("There are no devices supporting CUDA.\n");
+ return;
+ }
+
+ FuncGblEntries.resize(NumberOfDevices);
+ Contexts.resize(NumberOfDevices);
+ ThreadsPerBlock.resize(NumberOfDevices);
+ BlocksPerGrid.resize(NumberOfDevices);
+ WarpSize.resize(NumberOfDevices);
+ NumTeams.resize(NumberOfDevices);
+ NumThreads.resize(NumberOfDevices);
+
+ // Get environment variables regarding teams
+ char *envStr = getenv("OMP_TEAM_LIMIT");
+ if (envStr) {
+ // OMP_TEAM_LIMIT has been set
+ EnvTeamLimit = std::stoi(envStr);
+ DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
+ } else {
+ EnvTeamLimit = -1;
+ }
+ envStr = getenv("OMP_NUM_TEAMS");
+ if (envStr) {
+ // OMP_NUM_TEAMS has been set
+ EnvNumTeams = std::stoi(envStr);
+ DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
+ } else {
+ EnvNumTeams = -1;
+ }
+ }
+
+ ~RTLDeviceInfoTy() {
+ // Close modules
+ for (auto &module : Modules)
+ if (module) {
+ CUresult err = cuModuleUnload(module);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when unloading CUDA module\n");
+ CUDA_ERR_STRING(err);
+ }
+ }
+
+ // Destroy contexts
+ for (auto &ctx : Contexts)
+ if (ctx) {
+ CUresult err = cuCtxDestroy(ctx);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when destroying CUDA context\n");
+ CUDA_ERR_STRING(err);
+ }
+ }
+ }
+};
+
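+// Constructed eagerly at plugin load time, so cuInit() and device
+// enumeration happen as soon as libomptarget loads this RTL.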
+static RTLDeviceInfoTy DeviceInfo;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+ return elf_check_machine(image, 190); // EM_CUDA = 190.
+}
+
+int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
+
+int32_t __tgt_rtl_init_device(int32_t device_id) {
+
+ CUdevice cuDevice;
+ DP("Getting device %d\n", device_id);
+ CUresult err = cuDeviceGet(&cuDevice, device_id);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when getting CUDA device with id = %d\n", device_id);
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ // Create the context and save it to use whenever this device is selected.
+ err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
+ cuDevice);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when creating a CUDA context\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ // Query attributes to determine number of threads/block and blocks/grid.
+ int maxGridDimX;
+ err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
+ cuDevice);
+ if (err != CUDA_SUCCESS) {
+ DP("Error getting max grid dimension, use default\n");
+ DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+ } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {
+ DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;
+ DP("Using %d CUDA blocks per grid\n", maxGridDimX);
+ } else {
+ DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
+ DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
+ "at the hard limit\n",
+ maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);
+ }
+
+ // We are only exploiting threads along the x axis.
+ int maxBlockDimX;
+ err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
+ cuDevice);
+ if (err != CUDA_SUCCESS) {
+ DP("Error getting max block dimension, use default\n");
+ DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
+ } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {
+ DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;
+ DP("Using %d CUDA threads per block\n", maxBlockDimX);
+ } else {
+ DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
+ DP("Max CUDA threads per block %d exceeds the hard thread limit %d, capping"
+ "at the hard limit\n",
+ maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);
+ }
+
+ int warpSize;
+ err =
+ cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
+ if (err != CUDA_SUCCESS) {
+ DP("Error getting warp size, assume default\n");
+ DeviceInfo.WarpSize[device_id] = 32;
+ } else {
+ DeviceInfo.WarpSize[device_id] = warpSize;
+ }
+
+ // Adjust teams to the env variables
+ if (DeviceInfo.EnvTeamLimit > 0 &&
+ DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
+ DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
+ DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
+ DeviceInfo.EnvTeamLimit);
+ }
+
+ DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
+ DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
+ DeviceInfo.WarpSize[device_id]);
+
+ // Set default number of teams
+ if (DeviceInfo.EnvNumTeams > 0) {
+ DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
+ DP("Default number of teams set according to environment %d\n",
+ DeviceInfo.EnvNumTeams);
+ } else {
+ DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+ DP("Default number of teams set according to library's default %d\n",
+ RTLDeviceInfoTy::DefaultNumTeams);
+ }
+ if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
+ DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
+ DP("Default number of teams exceeds device limit, capping at %d\n",
+ DeviceInfo.BlocksPerGrid[device_id]);
+ }
+
+ // Set default number of threads
+ DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
+ DP("Default number of threads set according to library's default %d\n",
+ RTLDeviceInfoTy::DefaultNumThreads);
+ if (DeviceInfo.NumThreads[device_id] >
+ DeviceInfo.ThreadsPerBlock[device_id]) {
+    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
+ DP("Default number of threads exceeds device limit, capping at %d\n",
+ DeviceInfo.ThreadsPerBlock[device_id]);
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+ __tgt_device_image *image) {
+
+ // Set the context we are using.
+ CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when setting a CUDA context for device %d\n", device_id);
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ // Clear the offload table as we are going to create a new one.
+ DeviceInfo.clearOffloadEntriesTable(device_id);
+
+ // Create the module and extract the function pointers.
+
+ CUmodule cumod;
+ DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
+ err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when loading CUDA module\n");
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ DP("CUDA module successfully loaded!\n");
+ DeviceInfo.Modules.push_back(cumod);
+
+ // Find the symbols in the module by name.
+ __tgt_offload_entry *HostBegin = image->EntriesBegin;
+ __tgt_offload_entry *HostEnd = image->EntriesEnd;
+
+ for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
+
+ if (!e->addr) {
+      // We return NULL when something like this happens: the host should
+      // always provide a non-null address that uniquely identifies the
+      // target region.
+ DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);
+
+ return NULL;
+ }
+
+ if (e->size) {
+ __tgt_offload_entry entry = *e;
+
+ CUdeviceptr cuptr;
+ size_t cusize;
+ err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);
+
+ if (err != CUDA_SUCCESS) {
+ DP("Loading global '%s' (Failed)\n", e->name);
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ if (cusize != e->size) {
+ DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
+ cusize, e->size);
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
+ DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
+ entry.addr = (void *)cuptr;
+
+ DeviceInfo.addOffloadEntry(device_id, entry);
+
+ continue;
+ }
+
+ CUfunction fun;
+ err = cuModuleGetFunction(&fun, cumod, e->name);
+
+ if (err != CUDA_SUCCESS) {
+ DP("Loading '%s' (Failed)\n", e->name);
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
+ DPxPTR(e - HostBegin), e->name, DPxPTR(fun));
+
+ // default value GENERIC (in case symbol is missing from cubin file)
+ int8_t ExecModeVal = ExecutionModeType::GENERIC;
+    std::string ExecModeNameStr(e->name);
+ ExecModeNameStr += "_exec_mode";
+ const char *ExecModeName = ExecModeNameStr.c_str();
+
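+    // The device RTL emits one global per target region, named
+    // "<entry name>_exec_mode", holding 0 (SPMD) or 1 (GENERIC); if the
+    // symbol is absent we fall back to the GENERIC default above.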
+ CUdeviceptr ExecModePtr;
+ size_t cusize;
+ err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
+ if (err == CUDA_SUCCESS) {
+ if ((size_t)cusize != sizeof(int8_t)) {
+ DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
+ ExecModeName, cusize, sizeof(int8_t));
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when copying data from device to host. Pointers: "
+ "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
+ DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ if (ExecModeVal < 0 || ExecModeVal > 1) {
+ DP("Error wrong exec_mode value specified in cubin file: %d\n",
+ ExecModeVal);
+ return NULL;
+ }
+ } else {
+ DP("Loading global exec_mode '%s' - symbol missing, using default value "
+ "GENERIC (1)\n", ExecModeName);
+ CUDA_ERR_STRING(err);
+ }
+
+ KernelsList.push_back(KernelTy(fun, ExecModeVal));
+
+ __tgt_offload_entry entry = *e;
+ entry.addr = (void *)&KernelsList.back();
+ DeviceInfo.addOffloadEntry(device_id, entry);
+ }
+
+ // send device environment data to the device
+ {
+ omptarget_device_environmentTy device_env;
+
+ device_env.debug_level = 0;
+
+#ifdef OMPTARGET_DEBUG
+ if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
+ device_env.debug_level = std::stoi(envStr);
+ }
+#endif
+
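+    // The symbol name must match the global defined by the device RTL.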
+    const char *device_env_Name = "omptarget_device_environment";
+ CUdeviceptr device_env_Ptr;
+ size_t cusize;
+
+ err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);
+
+ if (err == CUDA_SUCCESS) {
+ if ((size_t)cusize != sizeof(device_env)) {
+ DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
+ device_env_Name, cusize, sizeof(int32_t));
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when copying data from host to device. Pointers: "
+ "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
+ DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ DP("Sending global device environment data %zu bytes\n", (size_t)cusize);
+ } else {
+ DP("Finding global device environment '%s' - symbol missing.\n", device_env_Name);
+ DP("Continue, considering this is a device RTL which does not accept envrionment setting.\n");
+ }
+ }
+
+ return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
+ if (size == 0) {
+ return NULL;
+ }
+
+ // Set the context we are using.
+ CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+ if (err != CUDA_SUCCESS) {
+ DP("Error while trying to set CUDA current context\n");
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ CUdeviceptr ptr;
+ err = cuMemAlloc(&ptr, size);
+ if (err != CUDA_SUCCESS) {
+ DP("Error while trying to allocate %d\n", err);
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ void *vptr = (void *)ptr;
+ return vptr;
+}
+
+int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+ int64_t size) {
+ // Set the context we are using.
+ CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when setting CUDA context\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when copying data from host to device. Pointers: host = " DPxMOD
+ ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
+ DPxPTR(tgt_ptr), size);
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
+ int64_t size) {
+ // Set the context we are using.
+ CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when setting CUDA context\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when copying data from device to host. Pointers: host = " DPxMOD
+ ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
+ DPxPTR(tgt_ptr), size);
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+ // Set the context we are using.
+ CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when setting CUDA context\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ err = cuMemFree((CUdeviceptr)tgt_ptr);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when freeing CUDA memory\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
+ void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+ int32_t thread_limit, uint64_t loop_tripcount) {
+ // Set the context we are using.
+ CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when setting CUDA context\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+  // cuLaunchKernel expects an array of pointers to the actual argument
+  // values, so materialize each (base + offset) pointer in ptrs[] and pass
+  // the address of each element in args[].
+ std::vector<void *> args(arg_num);
+ std::vector<void *> ptrs(arg_num);
+
+ for (int32_t i = 0; i < arg_num; ++i) {
+ ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
+ args[i] = &ptrs[i];
+ }
+
+ KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
+
+ int cudaThreadsPerBlock;
+
+ if (thread_limit > 0) {
+ cudaThreadsPerBlock = thread_limit;
+ DP("Setting CUDA threads per block to requested %d\n", thread_limit);
+ // Add master warp if necessary
+ if (KernelInfo->ExecutionMode == GENERIC) {
+ cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
+ DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
+ }
+ } else {
+ cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
+ DP("Setting CUDA threads per block to default %d\n",
+ DeviceInfo.NumThreads[device_id]);
+ }
+
+ if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
+ cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
+ DP("Threads per block capped at device limit %d\n",
+ DeviceInfo.ThreadsPerBlock[device_id]);
+ }
+
+ int kernel_limit;
+ err = cuFuncGetAttribute(&kernel_limit,
+ CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
+ if (err == CUDA_SUCCESS) {
+ if (kernel_limit < cudaThreadsPerBlock) {
+ cudaThreadsPerBlock = kernel_limit;
+ DP("Threads per block capped at kernel limit %d\n", kernel_limit);
+ }
+ }
+
+ int cudaBlocksPerGrid;
+ if (team_num <= 0) {
+ if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
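+      // OMP_NUM_TEAMS was not set (EnvNumTeams < 0), so derive the team
+      // count from the loop trip count.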
+ if (KernelInfo->ExecutionMode == SPMD) {
+        // We have a combined construct, i.e. `target teams distribute
+        // parallel for [simd]`. Launch enough teams that each thread
+        // executes one iteration of the loop, rounding up to the nearest
+        // integer.
+ cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
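+        // E.g. a trip count of 1000 with 128 threads per block yields
+        // ((1000 - 1) / 128) + 1 = 8 blocks.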
+ } else {
+ // If we reach this point, then we have a non-combined construct, i.e.
+ // `teams distribute` with a nested `parallel for` and each team is
+ // assigned one iteration of the `distribute` loop. E.g.:
+ //
+ // #pragma omp target teams distribute
+ // for(...loop_tripcount...) {
+ // #pragma omp parallel for
+ // for(...) {}
+ // }
+ //
+ // Threads within a team will execute the iterations of the `parallel`
+ // loop.
+ cudaBlocksPerGrid = loop_tripcount;
+ }
+ DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
+ "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
+ cudaThreadsPerBlock);
+ } else {
+ cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
+ DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
+ }
+ } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
+ cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
+ DP("Capping number of teams to team limit %d\n",
+ DeviceInfo.BlocksPerGrid[device_id]);
+ } else {
+ cudaBlocksPerGrid = team_num;
+ DP("Using requested number of teams %d\n", team_num);
+ }
+
+ // Run on the device.
+ DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
+ cudaThreadsPerBlock);
+
+  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
+      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/,
+      0 /*stream*/, &args[0], 0 /*extra args*/);
+ if (err != CUDA_SUCCESS) {
+ DP("Device kernel launch failed!\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ DP("Launch of entry point at " DPxMOD " successful!\n",
+ DPxPTR(tgt_entry_ptr));
+
+ CUresult sync_err = cuCtxSynchronize();
+ if (sync_err != CUDA_SUCCESS) {
+ DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
+ CUDA_ERR_STRING(sync_err);
+ return OFFLOAD_FAIL;
+ } else {
+ DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+ void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
+  // Use one team and the default number of threads.
+ const int32_t team_num = 1;
+ const int32_t thread_limit = 0;
+ return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+ tgt_offsets, arg_num, team_num, thread_limit, 0);
+}
+
+#ifdef __cplusplus
+}
+#endif