diff options
Diffstat (limited to 'final/libomptarget/plugins')
-rw-r--r-- | final/libomptarget/plugins/CMakeLists.txt | 72 | ||||
-rw-r--r-- | final/libomptarget/plugins/aarch64/CMakeLists.txt | 18 | ||||
-rw-r--r-- | final/libomptarget/plugins/common/elf_common.c | 73 | ||||
-rw-r--r-- | final/libomptarget/plugins/cuda/CMakeLists.txt | 50 | ||||
-rw-r--r-- | final/libomptarget/plugins/cuda/src/rtl.cpp | 763 | ||||
-rw-r--r-- | final/libomptarget/plugins/exports | 15 | ||||
-rw-r--r-- | final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp | 340 | ||||
-rw-r--r-- | final/libomptarget/plugins/ppc64/CMakeLists.txt | 18 | ||||
-rw-r--r-- | final/libomptarget/plugins/ppc64le/CMakeLists.txt | 18 | ||||
-rw-r--r-- | final/libomptarget/plugins/x86_64/CMakeLists.txt | 18 |
10 files changed, 1385 insertions, 0 deletions
diff --git a/final/libomptarget/plugins/CMakeLists.txt b/final/libomptarget/plugins/CMakeLists.txt new file mode 100644 index 0000000..8c3d571 --- /dev/null +++ b/final/libomptarget/plugins/CMakeLists.txt @@ -0,0 +1,72 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build plugins for the user system if available. +# +##===----------------------------------------------------------------------===## + +# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id); +# - build a plugin for an ELF based generic 64-bit target based on libffi. +# - tmachine: name of the machine processor as used in the cmake build system. +# - tmachine_name: name of the machine to be printed with the debug messages. +# - tmachine_libname: machine name to be appended to the plugin library name. +macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") + if(LIBOMPTARGET_DEP_LIBELF_FOUND) + if(LIBOMPTARGET_DEP_LIBFFI_FOUND) + + libomptarget_say("Building ${tmachine_name} offloading plugin.") + + include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) + include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) + + # Define macro to be used as prefix of the runtime messages for this target. + add_definitions("-DTARGET_NAME=${tmachine_name}") + + # Define macro with the ELF ID for this target. + add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_library("omptarget.rtl.${tmachine_libname}" SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp) + + # Install plugin under the lib destination folder. + install(TARGETS "omptarget.rtl.${tmachine_libname}" + LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") + + target_link_libraries( + "omptarget.rtl.${tmachine_libname}" + ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + dl + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") + + # Report to the parent scope that we are building a plugin. + set(LIBOMPTARGET_SYSTEM_TARGETS + "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE) + + else(LIBOMPTARGET_DEP_LIBFFI_FOUND) + libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.") + endif(LIBOMPTARGET_DEP_LIBFFI_FOUND) + else(LIBOMPTARGET_DEP_LIBELF_FOUND) + libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.") + endif(LIBOMPTARGET_DEP_LIBELF_FOUND) +else() + libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.") +endif() +endmacro() + +add_subdirectory(aarch64) +add_subdirectory(cuda) +add_subdirectory(ppc64) +add_subdirectory(ppc64le) +add_subdirectory(x86_64) + +# Make sure the parent scope can see the plugins that will be created. +set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) + diff --git a/final/libomptarget/plugins/aarch64/CMakeLists.txt b/final/libomptarget/plugins/aarch64/CMakeLists.txt new file mode 100644 index 0000000..e3a76b9 --- /dev/null +++ b/final/libomptarget/plugins/aarch64/CMakeLists.txt @@ -0,0 +1,18 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for an aarch64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183") +else() + libomptarget_say("Not building aarch64 offloading plugin: machine not found in the system.") +endif() diff --git a/final/libomptarget/plugins/common/elf_common.c b/final/libomptarget/plugins/common/elf_common.c new file mode 100644 index 0000000..dd85575 --- /dev/null +++ b/final/libomptarget/plugins/common/elf_common.c @@ -0,0 +1,73 @@ +//===-- elf_common.c - Common ELF functionality -------------------*- C -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Common ELF functionality for target plugins. +// Must be included in the plugin source file AFTER omptarget.h has been +// included and macro DP(...) has been defined. +// . +// +//===----------------------------------------------------------------------===// + +#if !(defined(_OMPTARGET_H_) && defined(DP)) +#error Include elf_common.c in the plugin source AFTER omptarget.h has been\ + included and macro DP(...) has been defined. +#endif + +#include <elf.h> +#include <libelf.h> + +// Check whether an image is valid for execution on target_id +static inline int32_t elf_check_machine(__tgt_device_image *image, + uint16_t target_id) { + + // Is the library version incompatible with the header file? + if (elf_version(EV_CURRENT) == EV_NONE) { + DP("Incompatible ELF library!\n"); + return 0; + } + + char *img_begin = (char *)image->ImageStart; + char *img_end = (char *)image->ImageEnd; + size_t img_size = img_end - img_begin; + + // Obtain elf handler + Elf *e = elf_memory(img_begin, img_size); + if (!e) { + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return 0; + } + + // Check if ELF is the right kind. + if (elf_kind(e) != ELF_K_ELF) { + DP("Unexpected ELF type!\n"); + return 0; + } + Elf64_Ehdr *eh64 = elf64_getehdr(e); + Elf32_Ehdr *eh32 = elf32_getehdr(e); + + if (!eh64 && !eh32) { + DP("Unable to get machine ID from ELF file!\n"); + elf_end(e); + return 0; + } + + uint16_t MachineID; + if (eh64 && !eh32) + MachineID = eh64->e_machine; + else if (eh32 && !eh64) + MachineID = eh32->e_machine; + else { + DP("Ambiguous ELF header!\n"); + elf_end(e); + return 0; + } + + elf_end(e); + return MachineID == target_id; +} diff --git a/final/libomptarget/plugins/cuda/CMakeLists.txt b/final/libomptarget/plugins/cuda/CMakeLists.txt new file mode 100644 index 0000000..7210eec --- /dev/null +++ b/final/libomptarget/plugins/cuda/CMakeLists.txt @@ -0,0 +1,50 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a CUDA machine if available. +# +##===----------------------------------------------------------------------===## +if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")) + libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64 or ppc64le hosts.") + return() +elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND) + libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.") + return() +elseif(NOT LIBOMPTARGET_DEP_CUDA_FOUND) + libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.") + return() +elseif(NOT LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND) + libomptarget_say("Not building CUDA offloading plugin: CUDA Driver API not found in system.") + return() +endif() + +libomptarget_say("Building CUDA offloading plugin.") + +# Define the suffix for the runtime messaging dumps. +add_definitions(-DTARGET_NAME=CUDA) + +if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug) + add_definitions(-DCUDA_ERROR_REPORT) +endif() + +include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) +include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}) + +add_library(omptarget.rtl.cuda SHARED src/rtl.cpp) + +# Install plugin under the lib destination folder. +install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") + +target_link_libraries(omptarget.rtl.cuda + ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES} + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") + +# Report to the parent scope that we are building a plugin for CUDA. +set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE) diff --git a/final/libomptarget/plugins/cuda/src/rtl.cpp b/final/libomptarget/plugins/cuda/src/rtl.cpp new file mode 100644 index 0000000..d265a87 --- /dev/null +++ b/final/libomptarget/plugins/cuda/src/rtl.cpp @@ -0,0 +1,763 @@ +//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// RTL for CUDA machine +// +//===----------------------------------------------------------------------===// + +#include <cassert> +#include <cstddef> +#include <cuda.h> +#include <list> +#include <string> +#include <vector> + +#include "omptargetplugin.h" + +#ifndef TARGET_NAME +#define TARGET_NAME CUDA +#endif + +#ifdef OMPTARGET_DEBUG +static int DebugLevel = 0; + +#define GETNAME2(name) #name +#define GETNAME(name) GETNAME2(name) +#define DP(...) \ + do { \ + if (DebugLevel > 0) { \ + DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ + } \ + } while (false) +#else // OMPTARGET_DEBUG +#define DP(...) {} +#endif // OMPTARGET_DEBUG + +#include "../../common/elf_common.c" + +// Utility for retrieving and printing CUDA error string. +#ifdef CUDA_ERROR_REPORT +#define CUDA_ERR_STRING(err) \ + do { \ + const char *errStr; \ + cuGetErrorString(err, &errStr); \ + DP("CUDA error is: %s\n", errStr); \ + } while (0) +#else +#define CUDA_ERR_STRING(err) \ + {} +#endif + +/// Keep entries table per device. +struct FuncOrGblEntryTy { + __tgt_target_table Table; + std::vector<__tgt_offload_entry> Entries; +}; + +enum ExecutionModeType { + SPMD, // constructors, destructors, + // combined constructs (`teams distribute parallel for [simd]`) + GENERIC, // everything else + NONE +}; + +/// Use a single entity to encode a kernel and a set of flags +struct KernelTy { + CUfunction Func; + + // execution mode of kernel + // 0 - SPMD mode (without master warp) + // 1 - Generic mode (with master warp) + int8_t ExecutionMode; + + KernelTy(CUfunction _Func, int8_t _ExecutionMode) + : Func(_Func), ExecutionMode(_ExecutionMode) {} +}; + +/// Device envrionment data +/// Manually sync with the deviceRTL side for now, move to a dedicated header file later. +struct omptarget_device_environmentTy { + int32_t debug_level; +}; + +/// List that contains all the kernels. +/// FIXME: we may need this to be per device and per library. +std::list<KernelTy> KernelsList; + +/// Class containing all the device information. +class RTLDeviceInfoTy { + std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries; + +public: + int NumberOfDevices; + std::vector<CUmodule> Modules; + std::vector<CUcontext> Contexts; + + // Device properties + std::vector<int> ThreadsPerBlock; + std::vector<int> BlocksPerGrid; + std::vector<int> WarpSize; + + // OpenMP properties + std::vector<int> NumTeams; + std::vector<int> NumThreads; + + // OpenMP Environment properties + int EnvNumTeams; + int EnvTeamLimit; + + //static int EnvNumThreads; + static const int HardTeamLimit = 1<<16; // 64k + static const int HardThreadLimit = 1024; + static const int DefaultNumTeams = 128; + static const int DefaultNumThreads = 128; + + // Record entry point associated with device + void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + E.Entries.push_back(entry); + } + + // Return true if the entry is associated with device + bool findOffloadEntry(int32_t device_id, void *addr) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + for (auto &it : E.Entries) { + if (it.addr == addr) + return true; + } + + return false; + } + + // Return the pointer to the target entries table + __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + int32_t size = E.Entries.size(); + + // Table is empty + if (!size) + return 0; + + __tgt_offload_entry *begin = &E.Entries[0]; + __tgt_offload_entry *end = &E.Entries[size - 1]; + + // Update table info according to the entries and return the pointer + E.Table.EntriesBegin = begin; + E.Table.EntriesEnd = ++end; + + return &E.Table; + } + + // Clear entries table for a device + void clearOffloadEntriesTable(int32_t device_id) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncGblEntries[device_id].emplace_back(); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + E.Entries.clear(); + E.Table.EntriesBegin = E.Table.EntriesEnd = 0; + } + + RTLDeviceInfoTy() { +#ifdef OMPTARGET_DEBUG + if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { + DebugLevel = std::stoi(envStr); + } +#endif // OMPTARGET_DEBUG + + DP("Start initializing CUDA\n"); + + CUresult err = cuInit(0); + if (err != CUDA_SUCCESS) { + DP("Error when initializing CUDA\n"); + CUDA_ERR_STRING(err); + return; + } + + NumberOfDevices = 0; + + err = cuDeviceGetCount(&NumberOfDevices); + if (err != CUDA_SUCCESS) { + DP("Error when getting CUDA device count\n"); + CUDA_ERR_STRING(err); + return; + } + + if (NumberOfDevices == 0) { + DP("There are no devices supporting CUDA.\n"); + return; + } + + FuncGblEntries.resize(NumberOfDevices); + Contexts.resize(NumberOfDevices); + ThreadsPerBlock.resize(NumberOfDevices); + BlocksPerGrid.resize(NumberOfDevices); + WarpSize.resize(NumberOfDevices); + NumTeams.resize(NumberOfDevices); + NumThreads.resize(NumberOfDevices); + + // Get environment variables regarding teams + char *envStr = getenv("OMP_TEAM_LIMIT"); + if (envStr) { + // OMP_TEAM_LIMIT has been set + EnvTeamLimit = std::stoi(envStr); + DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); + } else { + EnvTeamLimit = -1; + } + envStr = getenv("OMP_NUM_TEAMS"); + if (envStr) { + // OMP_NUM_TEAMS has been set + EnvNumTeams = std::stoi(envStr); + DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); + } else { + EnvNumTeams = -1; + } + } + + ~RTLDeviceInfoTy() { + // Close modules + for (auto &module : Modules) + if (module) { + CUresult err = cuModuleUnload(module); + if (err != CUDA_SUCCESS) { + DP("Error when unloading CUDA module\n"); + CUDA_ERR_STRING(err); + } + } + + // Destroy contexts + for (auto &ctx : Contexts) + if (ctx) { + CUresult err = cuCtxDestroy(ctx); + if (err != CUDA_SUCCESS) { + DP("Error when destroying CUDA context\n"); + CUDA_ERR_STRING(err); + } + } + } +}; + +static RTLDeviceInfoTy DeviceInfo; + +#ifdef __cplusplus +extern "C" { +#endif + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { + return elf_check_machine(image, 190); // EM_CUDA = 190. +} + +int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; } + +int32_t __tgt_rtl_init_device(int32_t device_id) { + + CUdevice cuDevice; + DP("Getting device %d\n", device_id); + CUresult err = cuDeviceGet(&cuDevice, device_id); + if (err != CUDA_SUCCESS) { + DP("Error when getting CUDA device with id = %d\n", device_id); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + // Create the context and save it to use whenever this device is selected. + err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC, + cuDevice); + if (err != CUDA_SUCCESS) { + DP("Error when creating a CUDA context\n"); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + // Query attributes to determine number of threads/block and blocks/grid. + int maxGridDimX; + err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + cuDevice); + if (err != CUDA_SUCCESS) { + DP("Error getting max grid dimension, use default\n"); + DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams; + } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) { + DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX; + DP("Using %d CUDA blocks per grid\n", maxGridDimX); + } else { + DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit; + DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " + "at the hard limit\n", + maxGridDimX, RTLDeviceInfoTy::HardTeamLimit); + } + + // We are only exploiting threads along the x axis. + int maxBlockDimX; + err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + cuDevice); + if (err != CUDA_SUCCESS) { + DP("Error getting max block dimension, use default\n"); + DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads; + } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) { + DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX; + DP("Using %d CUDA threads per block\n", maxBlockDimX); + } else { + DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit; + DP("Max CUDA threads per block %d exceeds the hard thread limit %d, capping" + "at the hard limit\n", + maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit); + } + + int warpSize; + err = + cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice); + if (err != CUDA_SUCCESS) { + DP("Error getting warp size, assume default\n"); + DeviceInfo.WarpSize[device_id] = 32; + } else { + DeviceInfo.WarpSize[device_id] = warpSize; + } + + // Adjust teams to the env variables + if (DeviceInfo.EnvTeamLimit > 0 && + DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) { + DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit; + DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", + DeviceInfo.EnvTeamLimit); + } + + DP("Max number of CUDA blocks %d, threads %d & warp size %d\n", + DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id], + DeviceInfo.WarpSize[device_id]); + + // Set default number of teams + if (DeviceInfo.EnvNumTeams > 0) { + DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams; + DP("Default number of teams set according to environment %d\n", + DeviceInfo.EnvNumTeams); + } else { + DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams; + DP("Default number of teams set according to library's default %d\n", + RTLDeviceInfoTy::DefaultNumTeams); + } + if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) { + DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id]; + DP("Default number of teams exceeds device limit, capping at %d\n", + DeviceInfo.BlocksPerGrid[device_id]); + } + + // Set default number of threads + DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads; + DP("Default number of threads set according to library's default %d\n", + RTLDeviceInfoTy::DefaultNumThreads); + if (DeviceInfo.NumThreads[device_id] > + DeviceInfo.ThreadsPerBlock[device_id]) { + DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id]; + DP("Default number of threads exceeds device limit, capping at %d\n", + DeviceInfo.ThreadsPerBlock[device_id]); + } + + return OFFLOAD_SUCCESS; +} + +__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, + __tgt_device_image *image) { + + // Set the context we are using. + CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) { + DP("Error when setting a CUDA context for device %d\n", device_id); + CUDA_ERR_STRING(err); + return NULL; + } + + // Clear the offload table as we are going to create a new one. + DeviceInfo.clearOffloadEntriesTable(device_id); + + // Create the module and extract the function pointers. + + CUmodule cumod; + DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart)); + err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL); + if (err != CUDA_SUCCESS) { + DP("Error when loading CUDA module\n"); + CUDA_ERR_STRING(err); + return NULL; + } + + DP("CUDA module successfully loaded!\n"); + DeviceInfo.Modules.push_back(cumod); + + // Find the symbols in the module by name. + __tgt_offload_entry *HostBegin = image->EntriesBegin; + __tgt_offload_entry *HostEnd = image->EntriesEnd; + + for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { + + if (!e->addr) { + // We return NULL when something like this happens, the host should have + // always something in the address to uniquely identify the target region. + DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size); + + return NULL; + } + + if (e->size) { + __tgt_offload_entry entry = *e; + + CUdeviceptr cuptr; + size_t cusize; + err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name); + + if (err != CUDA_SUCCESS) { + DP("Loading global '%s' (Failed)\n", e->name); + CUDA_ERR_STRING(err); + return NULL; + } + + if (cusize != e->size) { + DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name, + cusize, e->size); + CUDA_ERR_STRING(err); + return NULL; + } + + DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", + DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr)); + entry.addr = (void *)cuptr; + + DeviceInfo.addOffloadEntry(device_id, entry); + + continue; + } + + CUfunction fun; + err = cuModuleGetFunction(&fun, cumod, e->name); + + if (err != CUDA_SUCCESS) { + DP("Loading '%s' (Failed)\n", e->name); + CUDA_ERR_STRING(err); + return NULL; + } + + DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", + DPxPTR(e - HostBegin), e->name, DPxPTR(fun)); + + // default value GENERIC (in case symbol is missing from cubin file) + int8_t ExecModeVal = ExecutionModeType::GENERIC; + std::string ExecModeNameStr (e->name); + ExecModeNameStr += "_exec_mode"; + const char *ExecModeName = ExecModeNameStr.c_str(); + + CUdeviceptr ExecModePtr; + size_t cusize; + err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName); + if (err == CUDA_SUCCESS) { + if ((size_t)cusize != sizeof(int8_t)) { + DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", + ExecModeName, cusize, sizeof(int8_t)); + CUDA_ERR_STRING(err); + return NULL; + } + + err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize); + if (err != CUDA_SUCCESS) { + DP("Error when copying data from device to host. Pointers: " + "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", + DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize); + CUDA_ERR_STRING(err); + return NULL; + } + + if (ExecModeVal < 0 || ExecModeVal > 1) { + DP("Error wrong exec_mode value specified in cubin file: %d\n", + ExecModeVal); + return NULL; + } + } else { + DP("Loading global exec_mode '%s' - symbol missing, using default value " + "GENERIC (1)\n", ExecModeName); + CUDA_ERR_STRING(err); + } + + KernelsList.push_back(KernelTy(fun, ExecModeVal)); + + __tgt_offload_entry entry = *e; + entry.addr = (void *)&KernelsList.back(); + DeviceInfo.addOffloadEntry(device_id, entry); + } + + // send device environment data to the device + { + omptarget_device_environmentTy device_env; + + device_env.debug_level = 0; + +#ifdef OMPTARGET_DEBUG + if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { + device_env.debug_level = std::stoi(envStr); + } +#endif + + const char * device_env_Name="omptarget_device_environment"; + CUdeviceptr device_env_Ptr; + size_t cusize; + + err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name); + + if (err == CUDA_SUCCESS) { + if ((size_t)cusize != sizeof(device_env)) { + DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n", + device_env_Name, cusize, sizeof(int32_t)); + CUDA_ERR_STRING(err); + return NULL; + } + + err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize); + if (err != CUDA_SUCCESS) { + DP("Error when copying data from host to device. Pointers: " + "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", + DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize); + CUDA_ERR_STRING(err); + return NULL; + } + + DP("Sending global device environment data %zu bytes\n", (size_t)cusize); + } else { + DP("Finding global device environment '%s' - symbol missing.\n", device_env_Name); + DP("Continue, considering this is a device RTL which does not accept envrionment setting.\n"); + } + } + + return DeviceInfo.getOffloadEntriesTable(device_id); +} + +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { + if (size == 0) { + return NULL; + } + + // Set the context we are using. + CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) { + DP("Error while trying to set CUDA current context\n"); + CUDA_ERR_STRING(err); + return NULL; + } + + CUdeviceptr ptr; + err = cuMemAlloc(&ptr, size); + if (err != CUDA_SUCCESS) { + DP("Error while trying to allocate %d\n", err); + CUDA_ERR_STRING(err); + return NULL; + } + + void *vptr = (void *)ptr; + return vptr; +} + +int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + // Set the context we are using. + CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size); + if (err != CUDA_SUCCESS) { + DP("Error when copying data from host to device. Pointers: host = " DPxMOD + ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), + DPxPTR(tgt_ptr), size); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + // Set the context we are using. + CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size); + if (err != CUDA_SUCCESS) { + DP("Error when copying data from device to host. Pointers: host = " DPxMOD + ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), + DPxPTR(tgt_ptr), size); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { + // Set the context we are using. + CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + err = cuMemFree((CUdeviceptr)tgt_ptr); + if (err != CUDA_SUCCESS) { + DP("Error when freeing CUDA memory\n"); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, + int32_t thread_limit, uint64_t loop_tripcount) { + // Set the context we are using. + CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + // All args are references. + std::vector<void *> args(arg_num); + std::vector<void *> ptrs(arg_num); + + for (int32_t i = 0; i < arg_num; ++i) { + ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); + args[i] = &ptrs[i]; + } + + KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; + + int cudaThreadsPerBlock; + + if (thread_limit > 0) { + cudaThreadsPerBlock = thread_limit; + DP("Setting CUDA threads per block to requested %d\n", thread_limit); + // Add master warp if necessary + if (KernelInfo->ExecutionMode == GENERIC) { + cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id]; + DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]); + } + } else { + cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id]; + DP("Setting CUDA threads per block to default %d\n", + DeviceInfo.NumThreads[device_id]); + } + + if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) { + cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id]; + DP("Threads per block capped at device limit %d\n", + DeviceInfo.ThreadsPerBlock[device_id]); + } + + int kernel_limit; + err = cuFuncGetAttribute(&kernel_limit, + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func); + if (err == CUDA_SUCCESS) { + if (kernel_limit < cudaThreadsPerBlock) { + cudaThreadsPerBlock = kernel_limit; + DP("Threads per block capped at kernel limit %d\n", kernel_limit); + } + } + + int cudaBlocksPerGrid; + if (team_num <= 0) { + if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) { + if (KernelInfo->ExecutionMode == SPMD) { + // We have a combined construct, i.e. `target teams distribute parallel + // for [simd]`. We launch so many teams so that each thread will + // execute one iteration of the loop. + // round up to the nearest integer + cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1; + } else { + // If we reach this point, then we have a non-combined construct, i.e. + // `teams distribute` with a nested `parallel for` and each team is + // assigned one iteration of the `distribute` loop. E.g.: + // + // #pragma omp target teams distribute + // for(...loop_tripcount...) { + // #pragma omp parallel for + // for(...) {} + // } + // + // Threads within a team will execute the iterations of the `parallel` + // loop. + cudaBlocksPerGrid = loop_tripcount; + } + DP("Using %d teams due to loop trip count %" PRIu64 " and number of " + "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount, + cudaThreadsPerBlock); + } else { + cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id]; + DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]); + } + } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) { + cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id]; + DP("Capping number of teams to team limit %d\n", + DeviceInfo.BlocksPerGrid[device_id]); + } else { + cudaBlocksPerGrid = team_num; + DP("Using requested number of teams %d\n", team_num); + } + + // Run on the device. + DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid, + cudaThreadsPerBlock); + + err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1, + cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0); + if (err != CUDA_SUCCESS) { + DP("Device kernel launch failed!\n"); + CUDA_ERR_STRING(err); + return OFFLOAD_FAIL; + } + + DP("Launch of entry point at " DPxMOD " successful!\n", + DPxPTR(tgt_entry_ptr)); + + CUresult sync_err = cuCtxSynchronize(); + if (sync_err != CUDA_SUCCESS) { + DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr)); + CUDA_ERR_STRING(sync_err); + return OFFLOAD_FAIL; + } else { + DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); + } + + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { + // use one team and the default number of threads. + const int32_t team_num = 1; + const int32_t thread_limit = 0; + return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, + tgt_offsets, arg_num, team_num, thread_limit, 0); +} + +#ifdef __cplusplus +} +#endif diff --git a/final/libomptarget/plugins/exports b/final/libomptarget/plugins/exports new file mode 100644 index 0000000..3f9f7d4 --- /dev/null +++ b/final/libomptarget/plugins/exports @@ -0,0 +1,15 @@ +VERS1.0 { + global: + __tgt_rtl_is_valid_binary; + __tgt_rtl_number_of_devices; + __tgt_rtl_init_device; + __tgt_rtl_load_binary; + __tgt_rtl_data_alloc; + __tgt_rtl_data_submit; + __tgt_rtl_data_retrieve; + __tgt_rtl_data_delete; + __tgt_rtl_run_target_team_region; + __tgt_rtl_run_target_region; + local: + *; +}; diff --git a/final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp new file mode 100644 index 0000000..951710a --- /dev/null +++ b/final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp @@ -0,0 +1,340 @@ +//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// RTL for generic 64-bit machine +// +//===----------------------------------------------------------------------===// + +#include <cassert> +#include <cstdio> +#include <cstring> +#include <cstdlib> +#include <dlfcn.h> +#include <ffi.h> +#include <gelf.h> +#include <link.h> +#include <list> +#include <string> +#include <vector> + +#include "omptargetplugin.h" + +#ifndef TARGET_NAME +#define TARGET_NAME Generic ELF - 64bit +#endif + +#ifndef TARGET_ELF_ID +#define TARGET_ELF_ID 0 +#endif + +#ifdef OMPTARGET_DEBUG +static int DebugLevel = 0; + +#define GETNAME2(name) #name +#define GETNAME(name) GETNAME2(name) +#define DP(...) \ + do { \ + if (DebugLevel > 0) { \ + DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ + } \ + } while (false) +#else // OMPTARGET_DEBUG +#define DP(...) {} +#endif // OMPTARGET_DEBUG + +#include "../../common/elf_common.c" + +#define NUMBER_OF_DEVICES 4 +#define OFFLOADSECTIONNAME ".omp_offloading.entries" + +/// Array of Dynamic libraries loaded for this target. +struct DynLibTy { + char *FileName; + void *Handle; +}; + +/// Keep entries table per device. +struct FuncOrGblEntryTy { + __tgt_target_table Table; +}; + +/// Class containing all the device information. +class RTLDeviceInfoTy { + std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries; + +public: + std::list<DynLibTy> DynLibs; + + // Record entry point associated with device. + void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin, + __tgt_offload_entry *end) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncGblEntries[device_id].emplace_back(); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + E.Table.EntriesBegin = begin; + E.Table.EntriesEnd = end; + } + + // Return true if the entry is associated with device. + bool findOffloadEntry(int32_t device_id, void *addr) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd; + i < e; ++i) { + if (i->addr == addr) + return true; + } + + return false; + } + + // Return the pointer to the target entries table. + __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + return &E.Table; + } + + RTLDeviceInfoTy(int32_t num_devices) { +#ifdef OMPTARGET_DEBUG + if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { + DebugLevel = std::stoi(envStr); + } +#endif // OMPTARGET_DEBUG + + FuncGblEntries.resize(num_devices); + } + + ~RTLDeviceInfoTy() { + // Close dynamic libraries + for (auto &lib : DynLibs) { + if (lib.Handle) { + dlclose(lib.Handle); + remove(lib.FileName); + } + } + } +}; + +static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES); + +#ifdef __cplusplus +extern "C" { +#endif + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { +// If we don't have a valid ELF ID we can just fail. +#if TARGET_ELF_ID < 1 + return 0; +#else + return elf_check_machine(image, TARGET_ELF_ID); +#endif +} + +int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; } + +int32_t __tgt_rtl_init_device(int32_t device_id) { return OFFLOAD_SUCCESS; } + +__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, + __tgt_device_image *image) { + + DP("Dev %d: load binary from " DPxMOD " image\n", device_id, + DPxPTR(image->ImageStart)); + + assert(device_id >= 0 && device_id < NUMBER_OF_DEVICES && "bad dev id"); + + size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart; + size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin); + DP("Expecting to have %zd entries defined.\n", NumEntries); + + // Is the library version incompatible with the header file? + if (elf_version(EV_CURRENT) == EV_NONE) { + DP("Incompatible ELF library!\n"); + return NULL; + } + + // Obtain elf handler + Elf *e = elf_memory((char *)image->ImageStart, ImageSize); + if (!e) { + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return NULL; + } + + if (elf_kind(e) != ELF_K_ELF) { + DP("Invalid Elf kind!\n"); + elf_end(e); + return NULL; + } + + // Find the entries section offset + Elf_Scn *section = 0; + Elf64_Off entries_offset = 0; + + size_t shstrndx; + + if (elf_getshdrstrndx(e, &shstrndx)) { + DP("Unable to get ELF strings index!\n"); + elf_end(e); + return NULL; + } + + while ((section = elf_nextscn(e, section))) { + GElf_Shdr hdr; + gelf_getshdr(section, &hdr); + + if (!strcmp(elf_strptr(e, shstrndx, hdr.sh_name), OFFLOADSECTIONNAME)) { + entries_offset = hdr.sh_addr; + break; + } + } + + if (!entries_offset) { + DP("Entries Section Offset Not Found\n"); + elf_end(e); + return NULL; + } + + DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset)); + + // load dynamic library and get the entry points. We use the dl library + // to do the loading of the library, but we could do it directly to avoid the + // dump to the temporary file. + // + // 1) Create tmp file with the library contents. + // 2) Use dlopen to load the file and dlsym to retrieve the symbols. + char tmp_name[] = "/tmp/tmpfile_XXXXXX"; + int tmp_fd = mkstemp(tmp_name); + + if (tmp_fd == -1) { + elf_end(e); + return NULL; + } + + FILE *ftmp = fdopen(tmp_fd, "wb"); + + if (!ftmp) { + elf_end(e); + return NULL; + } + + fwrite(image->ImageStart, ImageSize, 1, ftmp); + fclose(ftmp); + + DynLibTy Lib = {tmp_name, dlopen(tmp_name, RTLD_LAZY)}; + + if (!Lib.Handle) { + DP("Target library loading error: %s\n", dlerror()); + elf_end(e); + return NULL; + } + + DeviceInfo.DynLibs.push_back(Lib); + + struct link_map *libInfo = (struct link_map *)Lib.Handle; + + // The place where the entries info is loaded is the library base address + // plus the offset determined from the ELF file. + Elf64_Addr entries_addr = libInfo->l_addr + entries_offset; + + DP("Pointer to first entry to be loaded is (" DPxMOD ").\n", + DPxPTR(entries_addr)); + + // Table of pointers to all the entries in the target. + __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr; + + __tgt_offload_entry *entries_begin = &entries_table[0]; + __tgt_offload_entry *entries_end = entries_begin + NumEntries; + + if (!entries_begin) { + DP("Can't obtain entries begin\n"); + elf_end(e); + return NULL; + } + + DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n", + DPxPTR(entries_begin), DPxPTR(entries_end)); + DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end); + + elf_end(e); + + return DeviceInfo.getOffloadEntriesTable(device_id); +} + +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { + void *ptr = malloc(size); + return ptr; +} + +int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + memcpy(tgt_ptr, hst_ptr, size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + memcpy(hst_ptr, tgt_ptr, size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { + free(tgt_ptr); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, + int32_t thread_limit, uint64_t loop_tripcount /*not used*/) { + // ignore team num and thread limit. + + // Use libffi to launch execution. + ffi_cif cif; + + // All args are references. + std::vector<ffi_type *> args_types(arg_num, &ffi_type_pointer); + std::vector<void *> args(arg_num); + std::vector<void *> ptrs(arg_num); + + for (int32_t i = 0; i < arg_num; ++i) { + ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); + args[i] = &ptrs[i]; + } + + ffi_status status = ffi_prep_cif(&cif, FFI_DEFAULT_ABI, arg_num, + &ffi_type_void, &args_types[0]); + + assert(status == FFI_OK && "Unable to prepare target launch!"); + + if (status != FFI_OK) + return OFFLOAD_FAIL; + + DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr)); + + void (*entry)(void); + *((void**) &entry) = tgt_entry_ptr; + ffi_call(&cif, entry, NULL, &args[0]); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { + // use one team and one thread. + return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, + tgt_offsets, arg_num, 1, 1, 0); +} + +#ifdef __cplusplus +} +#endif diff --git a/final/libomptarget/plugins/ppc64/CMakeLists.txt b/final/libomptarget/plugins/ppc64/CMakeLists.txt new file mode 100644 index 0000000..6849a03 --- /dev/null +++ b/final/libomptarget/plugins/ppc64/CMakeLists.txt @@ -0,0 +1,18 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a ppc64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21") +else() + libomptarget_say("Not building ppc64 offloading plugin: machine not found in the system.") +endif()
\ No newline at end of file diff --git a/final/libomptarget/plugins/ppc64le/CMakeLists.txt b/final/libomptarget/plugins/ppc64le/CMakeLists.txt new file mode 100644 index 0000000..87cefdf --- /dev/null +++ b/final/libomptarget/plugins/ppc64le/CMakeLists.txt @@ -0,0 +1,18 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a ppc64le machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21") +else() + libomptarget_say("Not building ppc64le offloading plugin: machine not found in the system.") +endif()
\ No newline at end of file diff --git a/final/libomptarget/plugins/x86_64/CMakeLists.txt b/final/libomptarget/plugins/x86_64/CMakeLists.txt new file mode 100644 index 0000000..bdd5bba --- /dev/null +++ b/final/libomptarget/plugins/x86_64/CMakeLists.txt @@ -0,0 +1,18 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a x86_64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62") +else() + libomptarget_say("Not building x86_64 offloading plugin: machine not found in the system.") +endif()
\ No newline at end of file |